# Publications

Publications with awards

Herrmann, M., & Webb, G. I.
Pattern Recognition, Art. no. 109333, in press.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{Herrmann2023a,
  author        = {Herrmann, Matthieu and Webb, Geoffrey I.},
  journal       = {Pattern Recognition},
  title         = {Amercing: An Intuitive and Effective Constraint for Dynamic Time Warping},
  year          = {2023},
  issn          = {0031-3203},
  note          = {In press},
  abstract      = {Dynamic Time Warping (DTW) is a time series distance measure that allows non-linear alignments between series. Constraints on the alignments in the form of windows and weights have been introduced because unconstrained DTW is too permissive in its alignments. However, windowing introduces a crude step function, allowing unconstrained flexibility within the window, and none beyond it. While not entailing a step function, a multiplicative weight is relative to the distances between aligned points along a warped path, rather than being a direct function of the amount of warping that is introduced. In this paper, we introduce Amerced Dynamic Time Warping (ADTW), a new, intuitive, DTW variant that penalizes the act of warping by a fixed additive cost. Like windowing and weighting, ADTW constrains the amount of warping. However, it avoids both abrupt discontinuities in the amount of warping allowed and the limitations of a multiplicative penalty. We formally introduce ADTW, prove some of its properties, and discuss its parameterization. We show on a simple example how it can be parameterized to achieve an intuitive outcome, and demonstrate its usefulness on a standard time series classification benchmark. We provide a demonstration application in C++ [1].},
  articlenumber = {109333},
  doi           = {10.1016/j.patcog.2023.109333},
  keywords      = {Time Series, Dynamic Time Warping, Elastic Distance},
  related       = {scalable-time-series-classifiers},
}
ABSTRACT Dynamic Time Warping (DTW) is a time series distance measure that allows non-linear alignments between series. Constraints on the alignments in the form of windows and weights have been introduced because unconstrained DTW is too permissive in its alignments. However, windowing introduces a crude step function, allowing unconstrained flexibility within the window, and none beyond it. While not entailing a step function, a multiplicative weight is relative to the distances between aligned points along a warped path, rather than being a direct function of the amount of warping that is introduced. In this paper, we introduce Amerced Dynamic Time Warping (ADTW), a new, intuitive, DTW variant that penalizes the act of warping by a fixed additive cost. Like windowing and weighting, ADTW constrains the amount of warping. However, it avoids both abrupt discontinuities in the amount of warping allowed and the limitations of a multiplicative penalty. We formally introduce ADTW, prove some of its properties, and discuss its parameterization. We show on a simple example how it can be parameterized to achieve an intuitive outcome, and demonstrate its usefulness on a standard time series classification benchmark. We provide a demonstration application in C++ [1].

Lucas, B., Pelletier, C., Schmidt, D., Webb, G. I., & Petitjean, F.
Machine Learning, in press.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{lucas2021bayesian,
  author    = {Lucas, Benjamin and Pelletier, Charlotte and Schmidt, Daniel and Webb, Geoffrey I. and Petitjean, Fran{\c{c}}ois},
  journal   = {Machine Learning},
  title     = {A {Bayesian}-inspired, deep learning-based, semi-supervised domain adaptation technique for land cover mapping},
  year      = {2021},
  note      = {In press},
  abstract  = {Land cover maps are a vital input variable to many types of environmental research and management. While they can be produced automatically by machine learning techniques, these techniques require substantial training data to achieve high levels of accuracy, which are not always available. One technique researchers use when labelled training data are scarce is domain adaptation (DA) - where data from an alternate region, known as the source domain, are used to train a classifier and this model is adapted to map the study region, or target domain. The scenario we address in this paper is known as semi-supervised DA, where some labelled samples are available in the target domain. In this paper we present Sourcerer, a Bayesian-inspired, deep learning-based, semi-supervised DA technique for producing land cover maps from satellite image time series (SITS) data. The technique takes a convolutional neural network trained on a source domain and then trains further on the available target domain with a novel regularizer applied to the model weights. The regularizer adjusts the degree to which the model is modified to fit the target data, limiting the degree of change when the target data are few in number and increasing it as target data quantity increases. Our experiments on Sentinel-2 time series images compare Sourcerer with two state-of-the-art semi-supervised domain adaptation techniques and four baseline models. We show that on two different source-target domain pairings Sourcerer outperforms all other methods for any quantity of labelled target data available. In fact, the results on the more difficult target domain show that the starting accuracy of Sourcerer (when no labelled target data are available), 74.2%, is greater than the next-best state-of-the-art method trained on 20,000 labelled target instances.},
  doi       = {10.1007/s10994-020-05942-z},
  keywords  = {time series, earth observation analytics},
  publisher = {Springer US},
  related   = {earth-observation-analytics},
}
ABSTRACT Land cover maps are a vital input variable to many types of environmental research and management. While they can be produced automatically by machine learning techniques, these techniques require substantial training data to achieve high levels of accuracy, which are not always available. One technique researchers use when labelled training data are scarce is domain adaptation (DA) - where data from an alternate region, known as the source domain, are used to train a classifier and this model is adapted to map the study region, or target domain. The scenario we address in this paper is known as semi-supervised DA, where some labelled samples are available in the target domain. In this paper we present Sourcerer, a Bayesian-inspired, deep learning-based, semi-supervised DA technique for producing land cover maps from satellite image time series (SITS) data. The technique takes a convolutional neural network trained on a source domain and then trains further on the available target domain with a novel regularizer applied to the model weights. The regularizer adjusts the degree to which the model is modified to fit the target data, limiting the degree of change when the target data are few in number and increasing it as target data quantity increases. Our experiments on Sentinel-2 time series images compare Sourcerer with two state-of-the-art semi-supervised domain adaptation techniques and four baseline models. We show that on two different source-target domain pairings Sourcerer outperforms all other methods for any quantity of labelled target data available. In fact, the results on the more difficult target domain show that the starting accuracy of Sourcerer (when no labelled target data are available), 74.2%, is greater than the next-best state-of-the-art method trained on 20,000 labelled target instances.

Godahewa, R., Bergmeir, C., Webb, G. I., & Montero-Manso, P.
International Journal of Forecasting, in press.
[Bibtex] [Abstract]  → Access on publisher site

@Article{GODAHEWA2022,
  author   = {Godahewa, Rakshitha and Bergmeir, Christoph and Webb, Geoffrey I. and Montero-Manso, Pablo},
  journal  = {International Journal of Forecasting},
  title    = {An accurate and fully-automated ensemble model for weekly time series forecasting},
  year     = {2022},
  issn     = {0169-2070},
  note     = {In press},
  abstract = {Many businesses and industries require accurate forecasts for weekly time series nowadays. However, the forecasting literature does not currently provide easy-to-use, automatic, reproducible and accurate approaches dedicated to this task. We propose a forecasting method in this domain to fill this gap, leveraging state-of-the-art forecasting techniques, such as forecast combination, meta-learning, and global modelling. We consider different meta-learning architectures, algorithms, and base model pools. Based on all considered model variants, we propose to use a stacking approach with lasso regression which optimally combines the forecasts of four base models: a global Recurrent Neural Network (RNN) model, Theta, Trigonometric Box–Cox ARMA Trend Seasonal (TBATS), and Dynamic Harmonic Regression ARIMA (DHR-ARIMA), as it shows the overall best performance across seven experimental weekly datasets on four evaluation metrics. Our proposed method also consistently outperforms a set of benchmarks and state-of-the-art weekly forecasting models by a considerable margin with statistical significance. Our method can produce the most accurate forecasts, in terms of mean sMAPE, for the M4 weekly dataset among all benchmarks and all original competition participants.},
  doi      = {10.1016/j.ijforecast.2022.01.008},
  keywords = {Weekly forecasting, Global models, Ensembling, Meta-learning, Time series forecasting},
}
ABSTRACT Many businesses and industries require accurate forecasts for weekly time series nowadays. However, the forecasting literature does not currently provide easy-to-use, automatic, reproducible and accurate approaches dedicated to this task. We propose a forecasting method in this domain to fill this gap, leveraging state-of-the-art forecasting techniques, such as forecast combination, meta-learning, and global modelling. We consider different meta-learning architectures, algorithms, and base model pools. Based on all considered model variants, we propose to use a stacking approach with lasso regression which optimally combines the forecasts of four base models: a global Recurrent Neural Network (RNN) model, Theta, Trigonometric Box–Cox ARMA Trend Seasonal (TBATS), and Dynamic Harmonic Regression ARIMA (DHR-ARIMA), as it shows the overall best performance across seven experimental weekly datasets on four evaluation metrics. Our proposed method also consistently outperforms a set of benchmarks and state-of-the-art weekly forecasting models by a considerable margin with statistical significance. Our method can produce the most accurate forecasts, in terms of mean sMAPE, for the M4 weekly dataset among all benchmarks and all original competition participants.

Tan, C. W., Herrmann, M., & Webb, G. I.
Knowledge and Information Systems, 2023.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{Tan2023,
  author    = {Tan, Chang Wei and Herrmann, Matthieu and Webb, Geoffrey I.},
  title     = {Ultra-fast meta-parameter optimization for time series similarity measures with application to nearest neighbour classification},
  journal   = {Knowledge and Information Systems},
  year      = {2023},
  publisher = {Springer Science and Business Media {LLC}},
  doi       = {10.1007/s10115-022-01827-w},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
}
ABSTRACT 

Jung, M., Lukose, D., Nielsen, S., Bell, S. J., Webb, G. I., & Ilomaki, J.
British Journal of Clinical Pharmacology, 2023.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Jung,
  author   = {Jung, Monica and Lukose, Dickson and Nielsen, Suzanne and Bell, J. Simon and Webb, Geoffrey I. and Ilomaki, Jenni},
  journal  = {British Journal of Clinical Pharmacology},
  title    = {{COVID-19} restrictions and the incidence and prevalence of prescription opioid use in {Australia} -- a nation-wide study},
  year     = {2023},
  abstract = {The COVID-19 pandemic has disrupted seeking and delivery of healthcare. Different Australian jurisdictions implemented different COVID-19 restrictions. We used Australian national pharmacy dispensing data to conduct interrupted time series analyses to examine the incidence and prevalence of opioid dispensing in different jurisdictions. Following nationwide COVID-19 restrictions, the incidence dropped by -0.40 [-0.50, -0.31], -0.33 [-0.46, -0.21] and -0.21 [-0.37, -0.04] /1000 people/week and prevalence dropped by -0.85 [-1.39, -0.31], -0.54 [-1.01, -0.07] and -0.62 [-0.99, -0.25] /1000 people/week in Victoria, New South Wales and other jurisdictions, respectively. Incidence and prevalence increased by 0.29 [0.13, 0.44] and 0.72 [0.11, 1.33] /1000 people/week, respectively in Victoria post-lockdown; no significant changes were observed in other jurisdictions. No significant changes were observed in the initiation of long-term opioid use in any jurisdictions. More stringent restrictions coincided with more pronounced reductions in overall opioid initiation, but initiation of long-term opioid use did not change.},
  doi      = {10.1111/bcp.15577},
  keywords = {health, opioids, chronic pain, drug utilisation, medication safety, quality use of medicines},
  related  = {health},
}
ABSTRACT The COVID-19 pandemic has disrupted seeking and delivery of healthcare. Different Australian jurisdictions implemented different COVID-19 restrictions. We used Australian national pharmacy dispensing data to conduct interrupted time series analyses to examine the incidence and prevalence of opioid dispensing in different jurisdictions. Following nationwide COVID-19 restrictions, the incidence dropped by -0.40 [-0.50, -0.31], -0.33 [-0.46, -0.21] and -0.21 [-0.37, -0.04] /1000 people/week and prevalence dropped by -0.85 [-1.39, -0.31], -0.54 [-1.01, -0.07] and -0.62 [-0.99, -0.25] /1000 people/week in Victoria, New South Wales and other jurisdictions, respectively. Incidence and prevalence increased by 0.29 [0.13, 0.44] and 0.72 [0.11, 1.33] /1000 people/week, respectively in Victoria post-lockdown; no significant changes were observed in other jurisdictions. No significant changes were observed in the initiation of long-term opioid use in any jurisdictions. More stringent restrictions coincided with more pronounced reductions in overall opioid initiation, but initiation of long-term opioid use did not change.

Bi, Y., Li, F., Guo, X., Wang, Z., Pan, T., Guo, Y., Webb, G. I., Yao, J., Jia, C., & Song, J.
Briefings in Bioinformatics, 23(6), Art. no. bbac467, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Bi2022,
  author        = {Bi, Yue and Li, Fuyi and Guo, Xudong and Wang, Zhikang and Pan, Tong and Guo, Yuming and Webb, Geoffrey I. and Yao, Jianhua and Jia, Cangzhi and Song, Jiangning},
  journal       = {Briefings in Bioinformatics},
  title         = {{Clarion} is a multi-label problem transformation method for identifying {mRNA} subcellular localizations},
  year          = {2022},
  issn          = {1477-4054},
  month         = nov,
  number        = {6},
  volume        = {23},
  abstract      = {Subcellular localization of messenger RNAs (mRNAs) plays a key role in the spatial regulation of gene activity. The functions of mRNAs have been shown to be closely linked with their localizations. As such, understanding of the subcellular localizations of mRNAs can help elucidate gene regulatory networks. Despite several computational methods that have been developed to predict mRNA localizations within cells, there is still much room for improvement in predictive performance, especially for the multiple-location prediction. In this study, we proposed a novel multi-label multi-class predictor, termed Clarion, for mRNA subcellular localization prediction. Clarion was developed based on a manually curated benchmark dataset and leveraged the weighted series method for multi-label transformation. Extensive benchmarking tests demonstrated Clarion achieved competitive predictive performance and the weighted series method plays a crucial role in securing superior performance of Clarion. In addition, the independent test results indicate that Clarion outperformed the state-of-the-art methods and can secure accuracy of 81.47, 91.29, 79.77, 92.10, 89.15, 83.74, 80.74, 79.23 and 84.74% for chromatin, cytoplasm, cytosol, exosome, membrane, nucleolus, nucleoplasm, nucleus and ribosome, respectively. The webserver and local stand-alone tool of Clarion is freely available at http://monash.bioweb.cloud.edu.au/Clarion/.},
  articlenumber = {bbac467},
  doi           = {10.1093/bib/bbac467},
  keywords      = {Bioinformatics},
  related       = {computational-biology},
}
ABSTRACT Subcellular localization of messenger RNAs (mRNAs) plays a key role in the spatial regulation of gene activity. The functions of mRNAs have been shown to be closely linked with their localizations. As such, understanding of the subcellular localizations of mRNAs can help elucidate gene regulatory networks. Despite several computational methods that have been developed to predict mRNA localizations within cells, there is still much room for improvement in predictive performance, especially for the multiple-location prediction. In this study, we proposed a novel multi-label multi-class predictor, termed Clarion, for mRNA subcellular localization prediction. Clarion was developed based on a manually curated benchmark dataset and leveraged the weighted series method for multi-label transformation. Extensive benchmarking tests demonstrated Clarion achieved competitive predictive performance and the weighted series method plays a crucial role in securing superior performance of Clarion. In addition, the independent test results indicate that Clarion outperformed the state-of-the-art methods and can secure accuracy of 81.47, 91.29, 79.77, 92.10, 89.15, 83.74, 80.74, 79.23 and 84.74% for chromatin, cytoplasm, cytosol, exosome, membrane, nucleolus, nucleoplasm, nucleus and ribosome, respectively. The webserver and local stand-alone tool of Clarion are freely available at http://monash.bioweb.cloud.edu.au/Clarion/.

Iqbal, S., Ge, F., Li, F., Akutsu, T., Zheng, Y., Gasser, R. B., Yu, D., Webb, G. I., & Song, J.
Journal of Chemical Information and Modeling, 62(17), 4270-4282, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Iqbal2022,
  author    = {Iqbal, Shahid and Ge, Fang and Li, Fuyi and Akutsu, Tatsuya and Zheng, Yuanting and Gasser, Robin B. and Yu, Dong-Jun and Webb, Geoffrey I. and Song, Jiangning},
  journal   = {Journal of Chemical Information and Modeling},
  title     = {{PROST}: {AlphaFold2}-aware Sequence-Based Predictor to Estimate Protein Stability Changes upon Missense Mutations},
  year      = {2022},
  number    = {17},
  pages     = {4270--4282},
  volume    = {62},
  abstract  = {An essential step in engineering proteins and understanding disease-causing missense mutations is to accurately model protein stability changes when such mutations occur. Here, we developed a new sequence-based predictor for protein stability (PROST) change (∆∆G) upon single-point missense mutation. PROST extracts multiple descriptors from the most promising sequence-based predictors, such as BoostDDG, SAAFEC-SEQ, and DDGun. PROST also extracts descriptors from iFeature and AlphaFold2. The extracted descriptors include sequence-based features, physicochemical properties, evolutionary information, evolutionary-based physicochemical properties, and predicted structural features. The PROST predictor is a weighted average ensemble model based on extreme gradient boosting (XGBoost) decision trees and extra-trees regressor, PROST is trained on both direct and hypothetical reverse mutations using the S5294 (S2647 direct mutations + S2647 inverse mutations). The parameters for the PROST model are optimized using grid searching with 5-fold cross-validation, and feature importance analysis unveils the most relevant features. The performance of PROST is evaluated in a blinded manner, employing nine distinct datasets and existing state-of-the-art sequence-based and structure-based predictors. This method consistently performs well on Frataxin, S217, S349, Ssym, Myoglobin, and CAGI5 datasets in blind tests, and similarly to the state-of-the-art predictors for p53 and S276 datasets. When the performance of PROST is compared with the latest predictors such as BoostDDG, SAAFEC-SEQ, ACDC-NN-seq, and DDGun, PROST dominates these predictors. A case study of mutation scanning of the Frataxin protein for nine wild-type residues demonstrates the utility of PROST. Taken together, these findings indicate that PROST is a well-suited predictor when no protein structural information is available. The source code of PROST, datasets, examples, pre-trained models, along with how to use PROST are available at https://github.com/ShahidIqb/PROST and https://prost.erc.monash.edu/seq.},
  doi       = {10.1021/acs.jcim.2c00799},
  keywords  = {Bioinformatics},
  publisher = {American Chemical Society ({ACS})},
  related   = {computational-biology},
}
ABSTRACT An essential step in engineering proteins and understanding disease-causing missense mutations is to accurately model protein stability changes when such mutations occur. Here, we developed a new sequence-based predictor for protein stability (PROST) change (∆∆G) upon single-point missense mutation. PROST extracts multiple descriptors from the most promising sequence-based predictors, such as BoostDDG, SAAFEC-SEQ, and DDGun. PROST also extracts descriptors from iFeature and AlphaFold2. The extracted descriptors include sequence-based features, physicochemical properties, evolutionary information, evolutionary-based physicochemical properties, and predicted structural features. The PROST predictor is a weighted average ensemble model based on extreme gradient boosting (XGBoost) decision trees and extra-trees regressor, PROST is trained on both direct and hypothetical reverse mutations using the S5294 (S2647 direct mutations + S2647 inverse mutations). The parameters for the PROST model are optimized using grid searching with 5-fold cross-validation, and feature importance analysis unveils the most relevant features. The performance of PROST is evaluated in a blinded manner, employing nine distinct datasets and existing state-of-the-art sequence-based and structure-based predictors. This method consistently performs well on Frataxin, S217, S349, Ssym, Myoglobin, and CAGI5 datasets in blind tests, and similarly to the state-of-the-art predictors for p53 and S276 datasets. When the performance of PROST is compared with the latest predictors such as BoostDDG, SAAFEC-SEQ, ACDC-NN-seq, and DDGun, PROST dominates these predictors. A case study of mutation scanning of the Frataxin protein for nine wild-type residues demonstrates the utility of PROST. Taken together, these findings indicate that PROST is a well-suited predictor when no protein structural information is available.
The source code of PROST, datasets, examples, pre-trained models, along with how to use PROST are available at https://github.com/ShahidIqb/PROST and https://prost.erc.monash.edu/seq.

Pialla, G., Fawaz, H. I., Devanne, M., Weber, J., Idoumghar, L., Muller, P., Bergmeir, C., Schmidt, D., Webb, G. I., & Forestier, G.
Proceedings of the 2022 Pacific-Asia Conference on Knowledge Discovery and Data Mining, Cham, pp. 485-496, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{10.1007/978-3-031-05933-9_38,
  author    = {Pialla, Gautier and Fawaz, Hassan Ismail and Devanne, Maxime and Weber, Jonathan and Idoumghar, Lhassane and Muller, Pierre-Alain and Bergmeir, Christoph and Schmidt, Daniel and Webb, Geoffrey I. and Forestier, Germain},
  booktitle = {Proceedings of the 2022 Pacific-Asia Conference on Knowledge Discovery and Data Mining},
  title     = {Smooth Perturbations for Time Series Adversarial Attacks},
  year      = {2022},
  editor    = {Gama, Jo{\~a}o and Li, Tianrui and Yu, Yang and Chen, Enhong and Zheng, Yu and Teng, Fei},
  pages     = {485--496},
  publisher = {Springer International Publishing},
  abstract  = {Adversarial attacks represent a threat to every deep neural network. They are particularly effective if they can perturb a given model while remaining undetectable. They have been initially introduced for image classifiers, and are well studied for this task. For time series, few attacks have yet been proposed. Most that have are adaptations of attacks previously proposed for image classifiers. Although these attacks are effective, they generate perturbations containing clearly discernible patterns such as sawtooth and spikes. Adversarial patterns are not perceptible on images, but the attacks proposed to date are readily perceptible in the case of time series. In order to generate stealthier adversarial attacks for time series, we propose a new attack that produces smoother perturbations. We find that smooth perturbations are harder to detect by the naked eye. We also show how adversarial training can improve model robustness against this attack, thus making models less vulnerable.},
  doi       = {10.1007/978-3-031-05933-9_38},
  isbn      = {978-3-031-05933-9},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
}
ABSTRACT Adversarial attacks represent a threat to every deep neural network. They are particularly effective if they can perturb a given model while remaining undetectable. They have been initially introduced for image classifiers, and are well studied for this task. For time series, few attacks have yet been proposed. Most that have are adaptations of attacks previously proposed for image classifiers. Although these attacks are effective, they generate perturbations containing clearly discernible patterns such as sawtooth and spikes. Adversarial patterns are not perceptible on images, but the attacks proposed to date are readily perceptible in the case of time series. In order to generate stealthier adversarial attacks for time series, we propose a new attack that produces smoother perturbations. We find that smooth perturbations are harder to detect by the naked eye. We also show how adversarial training can improve model robustness against this attack, thus making models less vulnerable.

Chen, Z., Liu, X., Li, F., Li, C., Marquez-Lago, T., Leier, A., Webb, G. I., Xu, D., Akutsu, T., & Song, J.
In KC, D. B. (Ed.), Computational Methods for Predicting Post-Translational Modification Sites (pp. 205-219). New York, NY: Springer US, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InCollection{Chen2022,
  author    = {Chen, Zhen and Liu, Xuhan and Li, Fuyi and Li, Chen and Marquez-Lago, Tatiana and Leier, Andr{\'e} and Webb, Geoffrey I. and Xu, Dakang and Akutsu, Tatsuya and Song, Jiangning},
  title     = {Systematic Characterization of Lysine Post-translational Modification Sites Using {MUscADEL}},
  booktitle = {Computational Methods for Predicting Post-Translational Modification Sites},
  editor    = {KC, Dukka B.},
  pages     = {205--219},
  publisher = {Springer US},
  year      = {2022},
  isbn      = {978-1-0716-2317-6},
  abstract  = {Among various types of protein post-translational modifications (PTMs), lysine PTMs play an important role in regulating a wide range of functions and biological processes. Due to the generation and accumulation of enormous amount of protein sequence data by ongoing whole-genome sequencing projects, systematic identification of different types of lysine PTM substrates and their specific PTM sites in the entire proteome is increasingly important and has therefore received much attention. Accordingly, a variety of computational methods for lysine PTM identification have been developed based on the combination of various handcrafted sequence features and machine-learning techniques. In this chapter, we first briefly review existing computational methods for lysine PTM identification and then introduce a recently developed deep learning-based method, termed MUscADEL (Multiple Scalable Accurate Deep Learner for lysine PTMs). Specifically, MUscADEL employs bidirectional long short-term memory (BiLSTM) recurrent neural networks and is capable of predicting eight major types of lysine PTMs in both the human and mouse proteomes. The web server of MUscADEL is publicly available at http://muscadel.erc.monash.edu/ for the research community to use.},
  doi       = {10.1007/978-1-0716-2317-6_11},
  keywords  = {Bioinformatics},
  related   = {computational-biology},
}
ABSTRACT Among various types of protein post-translational modifications (PTMs), lysine PTMs play an important role in regulating a wide range of functions and biological processes. Due to the generation and accumulation of enormous amount of protein sequence data by ongoing whole-genome sequencing projects, systematic identification of different types of lysine PTM substrates and their specific PTM sites in the entire proteome is increasingly important and has therefore received much attention. Accordingly, a variety of computational methods for lysine PTM identification have been developed based on the combination of various handcrafted sequence features and machine-learning techniques. In this chapter, we first briefly review existing computational methods for lysine PTM identification and then introduce a recently developed deep learning-based method, termed MUscADEL (Multiple Scalable Accurate Deep Learner for lysine PTMs). Specifically, MUscADEL employs bidirectional long short-term memory (BiLSTM) recurrent neural networks and is capable of predicting eight major types of lysine PTMs in both the human and mouse proteomes. The web server of MUscADEL is publicly available at http://muscadel.erc.monash.edu/ for the research community to use.

Miller, L., Zhu, L., Yebra, M., Rüdiger, C., & Webb, G. I.
Environmental Modelling & Software, 105467, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Miller2022,
  author       = {Miller, Lynn and Zhu, Liujun and Yebra, Marta and R{\"u}diger, Christoph and Webb, Geoffrey I.},
  journal      = {Environmental Modelling \& Software},
  title        = {Multi-modal temporal {CNNs} for live fuel moisture content estimation},
  year         = {2022},
  issn         = {1364-8152},
  pages        = {105467},
  abstract     = {Live fuel moisture content (LFMC) is an important environmental indicator used to measure vegetation conditions and monitor for high fire risk conditions. However, LFMC is challenging to measure on a wide scale, thus reliable models for estimating LFMC are needed. Therefore, this paper proposes a new deep learning architecture for LFMC estimation. The architecture comprises an ensemble of temporal convolutional neural networks that learn from year-long time series of meteorological and reflectance data, and a few auxiliary inputs including the climate zone. LFMC estimation models are designed for two training and evaluation scenarios, one for sites where historical LFMC measurements are available (within-site), the other for sites without historical LFMC measurements (out-of-site). The models were trained and evaluated using a large database of LFMC samples measured in the field from 2001 to 2017 and achieved an RMSE of 20.87% for the within-site scenario and 25.36% for the out-of-site scenario.},
  creationdate = {2022-08-18T11:50:53},
  doi          = {10.1016/j.envsoft.2022.105467},
  keywords     = {Live fuel moisture content, MODIS, Convolutional neural network, Time series analysis, Fire risk, Deep learning ensembles},
  related      = {scalable-time-series-classifiers},
}
ABSTRACT Live fuel moisture content (LFMC) is an important environmental indicator used to measure vegetation conditions and monitor for high fire risk conditions. However, LFMC is challenging to measure on a wide scale, thus reliable models for estimating LFMC are needed. Therefore, this paper proposes a new deep learning architecture for LFMC estimation. The architecture comprises an ensemble of temporal convolutional neural networks that learn from year-long time series of meteorological and reflectance data, and a few auxiliary inputs including the climate zone. LFMC estimation models are designed for two training and evaluation scenarios, one for sites where historical LFMC measurements are available (within-site), the other for sites without historical LFMC measurements (out-of-site). The models were trained and evaluated using a large database of LFMC samples measured in the field from 2001 to 2017 and achieved an RMSE of 20.87% for the within-site scenario and 25.36% for the out-of-site scenario.

Wang, X., Li, F., Xu, J., Rong, J., Webb, G. I., Ge, Z., Li, J., & Song, J.
Briefings in Bioinformatics, 23(2), Art. no. bbac031, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbac031,
author = {Wang, Xiaoyu and Li, Fuyi and Xu, Jing and Rong, Jia and Webb, Geoffrey I and Ge, Zongyuan and Li, Jian and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {{ASPIRER}: a new computational approach for identifying non-classical secreted proteins based on deep learning},
year = {2022},
issn = {1477-4054},
number = {2},
volume = {23},
abstract = {Protein secretion has a pivotal role in many biological processes and is particularly important for intercellular communication, from the cytoplasm to the host or external environment. Gram-positive bacteria can secrete proteins through multiple secretion pathways. The non-classical secretion pathway has recently received increasing attention among these secretion pathways, but its exact mechanism remains unclear. Non-classical secreted proteins (NCSPs) are a class of secreted proteins lacking signal peptides and motifs. Several NCSP predictors have been proposed to identify NCSPs and most of them employed the whole amino acid sequence of NCSPs to construct the model. However, the sequence length of different proteins varies greatly. In addition, not all regions of the protein are equally important and some local regions are not relevant to the secretion. The functional regions of the protein, particularly in the N- and C-terminal regions, contain important determinants for secretion. In this study, we propose a new hybrid deep learning-based framework, referred to as ASPIRER, which improves the prediction of NCSPs from amino acid sequences. More specifically, it combines a whole sequence-based XGBoost model and an N-terminal sequence-based convolutional neural network model; 5-fold cross-validation and independent tests demonstrate that ASPIRER achieves superior performance than existing state-of-the-art approaches. The source code and curated datasets of ASPIRER are publicly available at https://github.com/yanwu20/ASPIRER/. ASPIRER is anticipated to be a useful tool for improved prediction of novel putative NCSPs from sequences information and prioritization of candidate proteins for follow-up experimental validation.},
articlenumber = {bbac031},
doi = {10.1093/bib/bbac031},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Protein secretion has a pivotal role in many biological processes and is particularly important for intercellular communication, from the cytoplasm to the host or external environment. Gram-positive bacteria can secrete proteins through multiple secretion pathways. The non-classical secretion pathway has recently received increasing attention among these secretion pathways, but its exact mechanism remains unclear. Non-classical secreted proteins (NCSPs) are a class of secreted proteins lacking signal peptides and motifs. Several NCSP predictors have been proposed to identify NCSPs and most of them employed the whole amino acid sequence of NCSPs to construct the model. However, the sequence length of different proteins varies greatly. In addition, not all regions of the protein are equally important and some local regions are not relevant to the secretion. The functional regions of the protein, particularly in the N- and C-terminal regions, contain important determinants for secretion. In this study, we propose a new hybrid deep learning-based framework, referred to as ASPIRER, which improves the prediction of NCSPs from amino acid sequences. More specifically, it combines a whole sequence-based XGBoost model and an N-terminal sequence-based convolutional neural network model; 5-fold cross-validation and independent tests demonstrate that ASPIRER achieves superior performance than existing state-of-the-art approaches. The source code and curated datasets of ASPIRER are publicly available at https://github.com/yanwu20/ASPIRER/. ASPIRER is anticipated to be a useful tool for improved prediction of novel putative NCSPs from sequences information and prioritization of candidate proteins for follow-up experimental validation.

Tan, C. W., Dempster, A., Bergmeir, C., & Webb, G. I.
Data Mining and Knowledge Discovery, 36, 1623–1646, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Tan2022,
author = {Tan, Chang Wei and Dempster, Angus and Bergmeir, Christoph and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {{MultiRocket}: multiple pooling operators and transformations for fast and effective time series classification},
year = {2022},
issn = {1573-756X},
pages = {1623--1646},
volume = {36},
abstract = {We propose MultiRocket, a fast time series classification (TSC) algorithm that achieves state-of-the-art accuracy with a tiny fraction of the time and without the complex ensembling structure of many state-of-the-art methods. MultiRocket improves on MiniRocket, one of the fastest TSC algorithms to date, by adding multiple pooling operators and transformations to improve the diversity of the features generated. In addition to processing the raw input series, MultiRocket also applies first order differences to transform the original series. Convolutions are applied to both representations, and four pooling operators are applied to the convolution outputs. When benchmarked using the University of California Riverside TSC benchmark datasets, MultiRocket is significantly more accurate than MiniRocket, and competitive with the best ranked current method in terms of accuracy, HIVE-COTE 2.0, while being orders of magnitude faster.},
doi = {10.1007/s10618-022-00844-1},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT We propose MultiRocket, a fast time series classification (TSC) algorithm that achieves state-of-the-art accuracy with a tiny fraction of the time and without the complex ensembling structure of many state-of-the-art methods. MultiRocket improves on MiniRocket, one of the fastest TSC algorithms to date, by adding multiple pooling operators and transformations to improve the diversity of the features generated. In addition to processing the raw input series, MultiRocket also applies first order differences to transform the original series. Convolutions are applied to both representations, and four pooling operators are applied to the convolution outputs. When benchmarked using the University of California Riverside TSC benchmark datasets, MultiRocket is significantly more accurate than MiniRocket, and competitive with the best ranked current method in terms of accuracy, HIVE-COTE 2.0, while being orders of magnitude faster.

Wang, Y., Wang, Y. G., Hu, C., Li, M., Fan, Y., Otter, N., Sam, I., Gou, H., Hu, Y., Kwok, T., Zalcberg, J., Boussioutas, A., Daly, R. J., Montúfar, G., Liò, P., Xu, D., Webb, G. I., & Song, J.
npj Precision Oncology, 6(1), Art. no. 45, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Wang2022,
author = {Wang, Yanan and Wang, Yu Guang and Hu, Changyuan and Li, Ming and Fan, Yanan and Otter, Nina and Sam, Ikuan and Gou, Hongquan and Hu, Yiqun and Kwok, Terry and Zalcberg, John and Boussioutas, Alex and Daly, Roger J. and Montúfar, Guido and Liò, Pietro and Xu, Dakang and Webb, Geoffrey I. and Song, Jiangning},
journal = {npj Precision Oncology},
title = {Cell graph neural networks enable the precise prediction of patient survival in gastric cancer},
year = {2022},
issn = {2397-768X},
number = {1},
volume = {6},
abstract = {Gastric cancer is one of the deadliest cancers worldwide. An accurate prognosis is essential for effective clinical assessment and treatment. Spatial patterns in the tumor microenvironment (TME) are conceptually indicative of the staging and progression of gastric cancer patients. Using spatial patterns of the TME by integrating and transforming the multiplexed immunohistochemistry (mIHC) images as Cell-Graphs, we propose a graph neural network-based approach, termed Cell-Graph Signature or CGSignature, powered by artificial intelligence, for the digital staging of TME and precise prediction of patient survival in gastric cancer. In this study, patient survival prediction is formulated as either a binary (short-term and long-term) or ternary (short-term, medium-term, and long-term) classification task. Extensive benchmarking experiments demonstrate that the CGSignature achieves outstanding model performance, with Area Under the Receiver Operating Characteristic curve of 0.960 +/- 0.01, and 0.771+/-0.024 to 0.904+/-0.012 for the binary- and ternary-classification, respectively. Moreover, Kaplan-Meier survival analysis indicates that the 'digital grade' cancer staging produced by CGSignature provides a remarkable capability in discriminating both binary and ternary classes with statistical significance (P value < 0.0001), significantly outperforming the AJCC 8th edition Tumor Node Metastasis staging system. Using Cell-Graphs extracted from mIHC images, CGSignature improves the assessment of the link between the TME spatial patterns and patient prognosis. Our study suggests the feasibility and benefits of such an artificial intelligence-powered digital staging system in diagnostic pathology and precision oncology.},
articlenumber = {45},
doi = {10.1038/s41698-022-00285-5},
keywords = {health},
related = {health},
url = {https://rdcu.be/cQeFD},
}
ABSTRACT Gastric cancer is one of the deadliest cancers worldwide. An accurate prognosis is essential for effective clinical assessment and treatment. Spatial patterns in the tumor microenvironment (TME) are conceptually indicative of the staging and progression of gastric cancer patients. Using spatial patterns of the TME by integrating and transforming the multiplexed immunohistochemistry (mIHC) images as Cell-Graphs, we propose a graph neural network-based approach, termed Cell-Graph Signature or CGSignature, powered by artificial intelligence, for the digital staging of TME and precise prediction of patient survival in gastric cancer. In this study, patient survival prediction is formulated as either a binary (short-term and long-term) or ternary (short-term, medium-term, and long-term) classification task. Extensive benchmarking experiments demonstrate that the CGSignature achieves outstanding model performance, with Area Under the Receiver Operating Characteristic curve of 0.960 +/- 0.01, and 0.771+/-0.024 to 0.904+/-0.012 for the binary- and ternary-classification, respectively. Moreover, Kaplan-Meier survival analysis indicates that the 'digital grade' cancer staging produced by CGSignature provides a remarkable capability in discriminating both binary and ternary classes with statistical significance (P value < 0.0001), significantly outperforming the AJCC 8th edition Tumor Node Metastasis staging system. Using Cell-Graphs extracted from mIHC images, CGSignature improves the assessment of the link between the TME spatial patterns and patient prognosis. Our study suggests the feasibility and benefits of such an artificial intelligence-powered digital staging system in diagnostic pathology and precision oncology.

Li, F., Dong, S., Leier, A., Han, M., Guo, X., Xu, J., Wang, X., Pan, S., Jia, C., Zhang, Y., Webb, G. I., Coin, L. J. M., Li, C., & Song, J.
Briefings in Bioinformatics, 23(1), Art. no. bbab461, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbab461,
author = {Li, Fuyi and Dong, Shuangyu and Leier, Andre and Han, Meiya and Guo, Xudong and Xu, Jing and Wang, Xiaoyu and Pan, Shirui and Jia, Cangzhi and Zhang, Yang and Webb, Geoffrey I and Coin, Lachlan J M and Li, Chen and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {Positive-unlabeled learning in bioinformatics and computational biology: a brief review},
year = {2022},
issn = {1477-4054},
number = {1},
volume = {23},
abstract = {Conventional supervised binary classification algorithms have been widely applied to address significant research questions using biological and biomedical data. This classification scheme requires two fully labeled classes of data (e.g. positive and negative samples) to train a classification model. However, in many bioinformatics applications, labeling data is laborious, and the negative samples might be potentially mislabeled due to the limited sensitivity of the experimental equipment. The positive unlabeled (PU) learning scheme was therefore proposed to enable the classifier to learn directly from limited positive samples and a large number of unlabeled samples (i.e. a mixture of positive or negative samples). To date, several PU learning algorithms have been developed to address various biological questions, such as sequence identification, functional site characterization and interaction prediction. In this paper, we revisit a collection of 29 state-of-the-art PU learning bioinformatic applications to address various biological questions. Various important aspects are extensively discussed, including PU learning methodology, biological application, classifier design and evaluation strategy. We also comment on the existing issues of PU learning and offer our perspectives for the future development of PU learning applications. We anticipate that our work serves as an instrumental guideline for a better understanding of the PU learning framework in bioinformatics and further developing next-generation PU learning frameworks for critical biological applications.},
articlenumber = {bbab461},
doi = {10.1093/bib/bbab461},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Conventional supervised binary classification algorithms have been widely applied to address significant research questions using biological and biomedical data. This classification scheme requires two fully labeled classes of data (e.g. positive and negative samples) to train a classification model. However, in many bioinformatics applications, labeling data is laborious, and the negative samples might be potentially mislabeled due to the limited sensitivity of the experimental equipment. The positive unlabeled (PU) learning scheme was therefore proposed to enable the classifier to learn directly from limited positive samples and a large number of unlabeled samples (i.e. a mixture of positive or negative samples). To date, several PU learning algorithms have been developed to address various biological questions, such as sequence identification, functional site characterization and interaction prediction. In this paper, we revisit a collection of 29 state-of-the-art PU learning bioinformatic applications to address various biological questions. Various important aspects are extensively discussed, including PU learning methodology, biological application, classifier design and evaluation strategy. We also comment on the existing issues of PU learning and offer our perspectives for the future development of PU learning applications. We anticipate that our work serves as an instrumental guideline for a better understanding of the PU learning framework in bioinformatics and further developing next-generation PU learning frameworks for critical biological applications.

Wang, Y., Hu, C., Kwok, T., Bain, C. A., Xue, X., Gasser, R. B., Webb, G. I., Boussioutas, A., Shen, X., Daly, R. J., & Song, J.
Bioinformatics, 38(17), 4206–4213, 2022.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{Wang_2022,
author = {Yanan Wang and Changyuan Hu and Terry Kwok and Christopher A Bain and Xiangyang Xue and Robin B Gasser and Geoffrey I Webb and Alex Boussioutas and Xian Shen and Roger J Daly and Jiangning Song},
journal = {Bioinformatics},
title = {{DEMoS}: A Deep Learning-based Ensemble Approach for Predicting the Molecular Subtypes of Gastric Adenocarcinomas from Histopathological Images},
year = {2022},
number = {17},
pages = {4206--4213},
volume = {38},
doi = {10.1093/bioinformatics/btac456},
editor = {Hanchuan Peng},
keywords = {Bioinformatics},
publisher = {Oxford University Press ({OUP})},
related = {computational-biology},
}
ABSTRACT 

Manapragada, C., Gomes, H. M., Salehi, M., Bifet, A., & Webb, G. I.
Data Mining and Knowledge Discovery, 36, 566-619, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Manapragada_2022,
author = {Chaitanya Manapragada and Heitor M. Gomes and Mahsa Salehi and Albert Bifet and Geoffrey I. Webb},
journal = {Data Mining and Knowledge Discovery},
title = {An eager splitting strategy for online decision trees in ensembles},
year = {2022},
pages = {566--619},
volume = {36},
abstract = {Decision tree ensembles are widely used in practice. In this work, we study in ensemble settings the effectiveness of replacing the split strategy for the state-of-the-art online tree learner, Hoeffding Tree, with a rigorous but more eager splitting strategy that we had previously published as Hoeffding AnyTime Tree. Hoeffding AnyTime Tree (HATT), uses the Hoeffding Test to determine whether the current best candidate split is superior to the current split, with the possibility of revision, while Hoeffding Tree aims to determine whether the top candidate is better than the second best and if a test is selected, fixes it for all posterity. HATT converges to the ideal batch tree while Hoeffding Tree does not. We find that HATT is an efficacious base learner for online bagging and online boosting ensembles. On UCI and synthetic streams, HATT as a base learner outperforms HT at a 0.05 significance level for the majority of tested ensembles on what we believe is the largest and most comprehensive set of testbenches in the online learning literature. Our results indicate that HATT is a superior alternative to Hoeffding Tree in a large number of ensemble settings.},
doi = {10.1007/s10618-021-00816-x},
keywords = {Concept Drift},
publisher = {Springer Science and Business Media {LLC}},
related = {learning-from-non-stationary-distributions},
url = {https://rdcu.be/c1y4Z},
}
ABSTRACT Decision tree ensembles are widely used in practice. In this work, we study in ensemble settings the effectiveness of replacing the split strategy for the state-of-the-art online tree learner, Hoeffding Tree, with a rigorous but more eager splitting strategy that we had previously published as Hoeffding AnyTime Tree. Hoeffding AnyTime Tree (HATT), uses the Hoeffding Test to determine whether the current best candidate split is superior to the current split, with the possibility of revision, while Hoeffding Tree aims to determine whether the top candidate is better than the second best and if a test is selected, fixes it for all posterity. HATT converges to the ideal batch tree while Hoeffding Tree does not. We find that HATT is an efficacious base learner for online bagging and online boosting ensembles. On UCI and synthetic streams, HATT as a base learner outperforms HT at a 0.05 significance level for the majority of tested ensembles on what we believe is the largest and most comprehensive set of testbenches in the online learning literature. Our results indicate that HATT is a superior alternative to Hoeffding Tree in a large number of ensemble settings.

Zhang, M., Jia, C., Li, F., Li, C., Zhu, Y., Akutsu, T., Webb, G. I., Zou, Q., Coin, L. J. M., & Song, J.
Briefings in Bioinformatics, 23(2), Art. no. bbab551, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbab551,
author = {Zhang, Meng and Jia, Cangzhi and Li, Fuyi and Li, Chen and Zhu, Yan and Akutsu, Tatsuya and Webb, Geoffrey I and Zou, Quan and Coin, Lachlan J M and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {Critical assessment of computational tools for prokaryotic and eukaryotic promoter prediction},
year = {2022},
issn = {1477-4054},
number = {2},
volume = {23},
abstract = {Promoters are crucial regulatory DNA regions for gene transcriptional activation. Rapid advances in next-generation sequencing technologies have accelerated the accumulation of genome sequences, providing increased training data to inform computational approaches for both prokaryotic and eukaryotic promoter prediction. However, it remains a significant challenge to accurately identify species-specific promoter sequences using computational approaches. To advance computational support for promoter prediction, in this study, we curated 58 comprehensive, up-to-date, benchmark datasets for 7 different species (i.e. Escherichia coli, Bacillus subtilis, Homo sapiens, Mus musculus, Arabidopsis thaliana, Zea mays and Drosophila melanogaster) to assist the research community to assess the relative functionality of alternative approaches and support future research on both prokaryotic and eukaryotic promoters. We revisited 106 predictors published since 2000 for promoter identification (40 for prokaryotic promoter, 61 for eukaryotic promoter, and 5 for both). We systematically evaluated their training datasets, computational methodologies, calculated features, performance and software usability. On the basis of these benchmark datasets, we benchmarked 19 predictors with functioning webservers/local tools and assessed their prediction performance. We found that deep learning and traditional machine learning–based approaches generally outperformed scoring function–based approaches. Taken together, the curated benchmark dataset repository and the benchmarking analysis in this study serve to inform the design and implementation of computational approaches for promoter prediction and facilitate more rigorous comparison of new techniques in the future.},
articlenumber = {bbab551},
doi = {10.1093/bib/bbab551},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Promoters are crucial regulatory DNA regions for gene transcriptional activation. Rapid advances in next-generation sequencing technologies have accelerated the accumulation of genome sequences, providing increased training data to inform computational approaches for both prokaryotic and eukaryotic promoter prediction. However, it remains a significant challenge to accurately identify species-specific promoter sequences using computational approaches. To advance computational support for promoter prediction, in this study, we curated 58 comprehensive, up-to-date, benchmark datasets for 7 different species (i.e. Escherichia coli, Bacillus subtilis, Homo sapiens, Mus musculus, Arabidopsis thaliana, Zea mays and Drosophila melanogaster) to assist the research community to assess the relative functionality of alternative approaches and support future research on both prokaryotic and eukaryotic promoters. We revisited 106 predictors published since 2000 for promoter identification (40 for prokaryotic promoter, 61 for eukaryotic promoter, and 5 for both). We systematically evaluated their training datasets, computational methodologies, calculated features, performance and software usability. On the basis of these benchmark datasets, we benchmarked 19 predictors with functioning webservers/local tools and assessed their prediction performance. We found that deep learning and traditional machine learning–based approaches generally outperformed scoring function–based approaches. Taken together, the curated benchmark dataset repository and the benchmarking analysis in this study serve to inform the design and implementation of computational approaches for promoter prediction and facilitate more rigorous comparison of new techniques in the future.

Manapragada, C., Webb, G. I., & Salehi, M.
Proceedings of the 2022 IEEE International Conference on Data Mining (ICDM), pp. 319-328, 2022.
ICDM-2022 Best Paper Runner-up Award
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ManapragadaEtAl22,
author = {Manapragada, Chaitanya and Webb, Geoffrey I. and Salehi, Mahsa},
booktitle = {Proceedings of the 2022 IEEE International Conference on Data Mining (ICDM)},
title = {Extremely Fast {Hoeffding} Adaptive Tree},
year = {2022},
pages = {319--328},
publisher = {IEEE},
abstract = {Many real-world data streams are non-stationary.
Subject to concept drift, the distributions change over time.
To retain accuracy in the face of such drift, online decision
tree learners must discard parts of the tree that are no longer
accurate and replace them by new subtrees that reflect the
new distribution. The longstanding state-of-the-art online
decision tree learner for non-stationary streams is Hoeffding
Adaptive Tree (HAT), which adds a drift detection and response
mechanism to the classic Very Fast Decision Tree (VFDT) online
decision tree learner. However, for stationary distributions,
VFDT has been superseded by Extremely Fast Decision Tree
(EFDT), which uses a statistically more efficient learning
mechanism than VFDT. This learning mechanism needs to be
coupled with a compensatory revision mechanism that can
compensate for circumstances where the learning mechanism is
too eager.
The current work develops a strategy to combine the best
of both these state-of-the-art approaches, exploiting both the
statistically efficient learning mechanism from EFDT and the
highly effective drift detection and response mechanism of HAT.
To do so requires decoupling of the EFDT splitting and revision
mechanisms, as the latter incorrectly triggers the HAT drift
detection mechanism. The resulting learner, Extremely Fast
Hoeffding Adaptive Tree, responds to drift more rapidly and
effectively than either HAT or EFDT, and attains a statistically
significant advantage in accuracy even on stationary streams.},
comment = {ICDM-2022 Best Paper Runner-up Award},
doi = {10.1109/ICDM54844.2022.00042},
keywords = {Concept Drift},
related = {learning-from-non-stationary-distributions},
}
ABSTRACT Many real-world data streams are non-stationary. Subject to concept drift, the distributions change over time. To retain accuracy in the face of such drift, online decision tree learners must discard parts of the tree that are no longer accurate and replace them by new subtrees that reflect the new distribution. The longstanding state-of-the-art online decision tree learner for non-stationary streams is Hoeffding Adaptive Tree (HAT), which adds a drift detection and response mechanism to the classic Very Fast Decision Tree (VFDT) online decision tree learner. However, for stationary distributions, VFDT has been superseded by Extremely Fast Decision Tree (EFDT), which uses a statistically more efficient learning mechanism than VFDT. This learning mechanism needs to be coupled with a compensatory revision mechanism that can compensate for circumstances where the learning mechanism is too eager. The current work develops a strategy to combine the best of both these state-of-the-art approaches, exploiting both the statistically efficient learning mechanism from EFDT and the highly effective drift detection and response mechanism of HAT. To do so requires decoupling of the EFDT splitting and revision mechanisms, as the latter incorrectly triggers the HAT drift detection mechanism. The resulting learner, Extremely Fast Hoeffding Adaptive Tree, responds to drift more rapidly and effectively than either HAT or EFDT, and attains a statistically significant advantage in accuracy even on stationary streams.

Jung, M., Lukose, D., Nielsen, S., Bell, S. J., Webb, G. I., & Ilomäki, J.
Pharmacoepidemiology and Drug Safety, 31(S2 ABSTRACTS of ICPE 2022, the 38th International Conference on Pharmacoepidemiology and Therapeutic Risk Management), 3-628, 2022.
[Bibtex]  → Access on publisher site

@Article{JungEtAl22A,
author = {Jung, Monica and Lukose, Dickson and Nielsen, Suzanne and Bell, J. Simon and Webb, Geoffrey I. and Ilomäki, Jenni},
journal = {Pharmacoepidemiology and Drug Safety},
title = {Incidence and prevalence of prescription opioid use during {Australian} {COVID-19} restrictions},
year = {2022},
number = {S2 ABSTRACTS of ICPE 2022, the 38th International Conference on Pharmacoepidemiology and Therapeutic Risk Management},
pages = {3--628},
volume = {31},
creationdate = {2022-12-15T16:36:58},
doi = {10.1002/pds.5518},
}
ABSTRACT 

Livori, A. C., Lukose, D., Bell, S. J., Webb, G. I., & Ilomäki, J.
Current Problems in Cardiology, 101576, 2022.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Livori2022,
author = {Adam C Livori and Dickson Lukose and J Simon Bell and Geoffrey I Webb and Jenni Ilomäki},
journal = {Current Problems in Cardiology},
title = {Did {Australia's} {COVID-19} restrictions impact statin incidence, prevalence or adherence?},
year = {2022},
issn = {0146-2806},
pages = {101576},
abstract = {Objective
COVID-19 restrictions may have an unintended consequence of limiting access to cardiovascular care. Australia implemented adaptive interventions (e.g. telehealth consultations, digital image prescriptions, continued dispensing, medication delivery) to maintain medication access. This study investigated whether COVID-19 restrictions in different jurisdictions coincided with changes in statin incidence, prevalence and adherence.
Methods
Analysis of a 10% random sample of national medication claims data from January 2018 to December 2020 was conducted across three Australian jurisdictions. Weekly incidence and prevalence were estimated by dividing the number statin initiations and any statin dispensing by the Australian population aged 18-99 years. Statin adherence was analysed across the jurisdictions and years, with adherence categorised as <40%, 40-79% and ≥80% based on dispensings per calendar year.
Results
Overall, 309,123, 315,703 and 324,906 people were dispensed and 39029, 39816, and 44979 initiated statins in 2018, 2019 and 2020 respectively. Two waves of COVID-19 restrictions in 2020 coincided with no meaningful change in statin incidence or prevalence per week when compared to 2018 and 2019. Incidence increased 0.3% from 23.7 to 26.2 per 1000 people across jurisdictions in 2020 compared to 2019. Prevalence increased 0.14% from 158.5 to 159.9 per 1000 people across jurisdictions in 2020 compared to 2019. The proportion of adults with ≥80% adherence increased by 3.3% in Victoria, 1.4% in NSW and 1.8% in other states and territories between 2019 and 2020.
Conclusions
COVID-19 restrictions did not coincide with meaningful changes in the incidence, prevalence or adherence to statins suggesting adaptive interventions succeeded in maintaining access to cardiovascular medications.},
doi = {10.1016/j.cpcardiol.2022.101576},
keywords = {Statin, drug utilisation, medication adherence, cardiovascular, cardiology, health},
related = {health},
}
ABSTRACT Objective COVID-19 restrictions may have an unintended consequence of limiting access to cardiovascular care. Australia implemented adaptive interventions (e.g. telehealth consultations, digital image prescriptions, continued dispensing, medication delivery) to maintain medication access. This study investigated whether COVID-19 restrictions in different jurisdictions coincided with changes in statin incidence, prevalence and adherence. Methods Analysis of a 10% random sample of national medication claims data from January 2018 to December 2020 was conducted across three Australian jurisdictions. Weekly incidence and prevalence were estimated by dividing the number statin initiations and any statin dispensing by the Australian population aged 18-99 years. Statin adherence was analysed across the jurisdictions and years, with adherence categorised as <40%, 40-79% and ≥80% based on dispensings per calendar year. Results Overall, 309,123, 315,703 and 324,906 people were dispensed and 39029, 39816, and 44979 initiated statins in 2018, 2019 and 2020 respectively. Two waves of COVID-19 restrictions in 2020 coincided with no meaningful change in statin incidence or prevalence per week when compared to 2018 and 2019. Incidence increased 0.3% from 23.7 to 26.2 per 1000 people across jurisdictions in 2020 compared to 2019. Prevalence increased 0.14% from 158.5 to 159.9 per 1000 people across jurisdictions in 2020 compared to 2019. The proportion of adults with ≥80% adherence increased by 3.3% in Victoria, 1.4% in NSW and 1.8% in other states and territories between 2019 and 2020. Conclusions COVID-19 restrictions did not coincide with meaningful changes in the incidence, prevalence or adherence to statins suggesting adaptive interventions succeeded in maintaining access to cardiovascular medications.

Tan, C. W., Herrmann, M., & Webb, G. I.
IEEE International Conference on Data Mining (ICDM-21), pp. 589-598, 2021.
[Bibtex]  → Access on publisher site  → Related papers and software

@inproceedings{TanEtAlUltraFast2021,
  author    = {Tan, Chang Wei and Herrmann, Matthieu and Webb, Geoffrey I.},
  title     = {Ultra fast warping window optimization for Dynamic Time Warping},
  booktitle = {IEEE International Conference on Data Mining (ICDM-21)},
  year      = {2021},
  pages     = {589-598},
  doi       = {10.1109/ICDM51629.2021.00070},
  url       = {https://changweitan.com/research/UltraFastWWSearch.pdf},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
}
ABSTRACT 

Godahewa, R., Bandara, K., Webb, G. I., Smyl, S., & Bergmeir, C.
Knowledge-Based Systems, 233, Art. no. 107518, 2021.
[Bibtex] [Abstract]  → Access on publisher site

@article{GODAHEWA2021107518,
  author        = {Godahewa, Rakshitha and Bandara, Kasun and Webb, Geoffrey I. and Smyl, Slawek and Bergmeir, Christoph},
  title         = {Ensembles of localised models for time series forecasting},
  journal       = {Knowledge-Based Systems},
  year          = {2021},
  volume        = {233},
  articlenumber = {107518},
  issn          = {0950-7051},
  doi           = {10.1016/j.knosys.2021.107518},
  keywords      = {Time series forecasting, Feed-forward Neural Networks, Recurrent Neural Networks, Pooled Regression, Ensemble models},
  abstract      = {With large quantities of data typically available nowadays, forecasting models that are trained across sets of time series, known as Global Forecasting Models (GFM), are regularly outperforming traditional univariate forecasting models that work on isolated series. As GFMs usually share the same set of parameters across all time series, they often have the problem of not being localised enough to a particular series, especially in situations where datasets are heterogeneous. We study how ensembling techniques can be used with generic GFMs and univariate models to solve this issue. Our work systematises and compares relevant current approaches, namely clustering series and training separate submodels per cluster, the so-called ensemble of specialists approach, and building heterogeneous ensembles of global and local models. We fill some gaps in the existing GFM localisation approaches, in particular by incorporating varied clustering techniques such as feature-based clustering, distance-based clustering and random clustering, and generalise them to use different underlying GFM model types. We then propose a new methodology of clustered ensembles where we train multiple GFMs on different clusters of series, obtained by changing the number of clusters and cluster seeds. Using Feed-forward Neural Networks, Recurrent Neural Networks, and Pooled Regression models as the underlying GFMs, in our evaluation on eight publicly available datasets, the proposed models are able to achieve significantly higher accuracy than baseline GFM models and univariate forecasting methods.},
}
ABSTRACT With large quantities of data typically available nowadays, forecasting models that are trained across sets of time series, known as Global Forecasting Models (GFM), are regularly outperforming traditional univariate forecasting models that work on isolated series. As GFMs usually share the same set of parameters across all time series, they often have the problem of not being localised enough to a particular series, especially in situations where datasets are heterogeneous. We study how ensembling techniques can be used with generic GFMs and univariate models to solve this issue. Our work systematises and compares relevant current approaches, namely clustering series and training separate submodels per cluster, the so-called ensemble of specialists approach, and building heterogeneous ensembles of global and local models. We fill some gaps in the existing GFM localisation approaches, in particular by incorporating varied clustering techniques such as feature-based clustering, distance-based clustering and random clustering, and generalise them to use different underlying GFM model types. We then propose a new methodology of clustered ensembles where we train multiple GFMs on different clusters of series, obtained by changing the number of clusters and cluster seeds. Using Feed-forward Neural Networks, Recurrent Neural Networks, and Pooled Regression models as the underlying GFMs, in our evaluation on eight publicly available datasets, the proposed models are able to achieve significantly higher accuracy than baseline GFM models and univariate forecasting methods.

Herrmann, M., & Webb, G. I.
Data Mining and Knowledge Discovery, 35(6), 2577-2601, 2021.
[Bibtex]  → Access on publisher site  → Related papers and software

@article{Herrmann_2021,
  author    = {Herrmann, Matthieu and Webb, Geoffrey I.},
  title     = {Early abandoning and pruning for elastic distances including dynamic time warping},
  journal   = {Data Mining and Knowledge Discovery},
  year      = {2021},
  volume    = {35},
  number    = {6},
  pages     = {2577-2601},
  publisher = {Springer Science and Business Media {LLC}},
  doi       = {10.1007/s10618-021-00782-4},
  url       = {https://rdcu.be/cuoN0},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
}
ABSTRACT 

Wang, Y., Li, F., Bharathwaj, M., Rosas, N. C., Leier, A., Akutsu, T., Webb, G. I., Marquez-Lago, T. T., Li, J., Lithgow, T., & Song, J.
Briefings in Bioinformatics, 22(4), Art. no. bbaa301, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{Wang2020,
  author        = {Wang, Yanan and Li, Fuyi and Bharathwaj, Manasa and Rosas, Natalia C and Leier, Andr{\'{e}} and Akutsu, Tatsuya and Webb, Geoffrey I and Marquez-Lago, Tatiana T and Li, Jian and Lithgow, Trevor and Song, Jiangning},
  title         = {{DeepBL}: a deep learning-based approach for in silico discovery of beta-lactamases},
  journal       = {Briefings in Bioinformatics},
  year          = {2021},
  volume        = {22},
  number        = {4},
  articlenumber = {bbaa301},
  publisher     = {Oxford University Press ({OUP})},
  doi           = {10.1093/bib/bbaa301},
  keywords      = {Bioinformatics},
  related       = {computational-biology},
  abstract      = {Beta-lactamases (BLs) are enzymes localized in the periplasmic space of bacterial pathogens, where they confer resistance to beta-lactam antibiotics. Experimental identification of BLs is costly yet crucial to understand beta-lactam resistance mechanisms. To address this issue, we present DeepBL, a deep learning-based approach by incorporating sequence-derived features to enable high-throughput prediction of BLs. Specifically, DeepBL is implemented based on the Small VGGNet architecture and the TensorFlow deep learning library. Furthermore, the performance of DeepBL models is investigated in relation to the sequence redundancy level and negative sample selection in the benchmark dataset. The models are trained on datasets of varying sequence redundancy thresholds, and the model performance is evaluated by extensive benchmarking tests. Using the optimized DeepBL model, we perform proteome-wide screening for all reviewed bacterium protein sequences available from the UniProt database. These results are freely accessible at the DeepBL webserver at http://deepbl.erc.monash.edu.au/.},
}
ABSTRACT Beta-lactamases (BLs) are enzymes localized in the periplasmic space of bacterial pathogens, where they confer resistance to beta-lactam antibiotics. Experimental identification of BLs is costly yet crucial to understand beta-lactam resistance mechanisms. To address this issue, we present DeepBL, a deep learning-based approach by incorporating sequence-derived features to enable high-throughput prediction of BLs. Specifically, DeepBL is implemented based on the Small VGGNet architecture and the TensorFlow deep learning library. Furthermore, the performance of DeepBL models is investigated in relation to the sequence redundancy level and negative sample selection in the benchmark dataset. The models are trained on datasets of varying sequence redundancy thresholds, and the model performance is evaluated by extensive benchmarking tests. Using the optimized DeepBL model, we perform proteome-wide screening for all reviewed bacterium protein sequences available from the UniProt database. These results are freely accessible at the DeepBL webserver at http://deepbl.erc.monash.edu.au/.

Zhu, L., Webb, G. I., Yebra, M., Scortechini, G., Miller, L., & Petitjean, F.
ISPRS Journal of Photogrammetry and Remote Sensing, 179, 81-91, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{ZHU202181,
  author   = {Zhu, Liujun and Webb, Geoffrey I. and Yebra, Marta and Scortechini, Gianluca and Miller, Lynn and Petitjean, Francois},
  title    = {Live fuel moisture content estimation from {MODIS}: A deep learning approach},
  journal  = {ISPRS Journal of Photogrammetry and Remote Sensing},
  year     = {2021},
  volume   = {179},
  pages    = {81-91},
  issn     = {0924-2716},
  doi      = {10.1016/j.isprsjprs.2021.07.010},
  keywords = {time series, Live fuel moisture content, earth observation analytics, MODIS, Convolutional neural network, Time series analysis, Fire risk, Fire danger},
  related  = {earth-observation-analytics},
  abstract = {Live fuel moisture content (LFMC) is an essential variable to model fire danger and behaviour. This paper presents the first application of deep learning to LFMC estimation based on the historical LFMC ground samples of the Globe-LFMC database, as a step towards operational daily LFMC mapping in the Contiguous United States (CONUS). One-year MODerate resolution Imaging Spectroradiometer (MODIS) time series preceding each LFMC sample were extracted as the primary data source for training. The proposed temporal convolutional neural network for LFMC (TempCNN-LFMC) comprises three 1-D convolutional layers that learn the multi-scale temporal dynamics (features) of one-year MODIS time series specific to LFMC estimation. The learned features, together with a few auxiliary variables (e.g., digital elevation model), are then passed to three fully connected layers to extract the non-linear relationships with LFMC. In the primary training and validation scenario, the neural network was trained using samples from 2002 to 2013 and then adopted to estimating the LFMC from 2014 to 2018, achieving an overall root mean square error (RMSE) of 25.57% and a correlation coefficient (R) of 0.74. Good consistency on spatial patterns and temporal trends of accuracy was observed. The trained model achieved a similar RMSE of 25.98%, 25.20% and 25.93% for forest, shrubland, and grassland, respectively, without requiring prior information on the vegetation type.},
}
ABSTRACT Live fuel moisture content (LFMC) is an essential variable to model fire danger and behaviour. This paper presents the first application of deep learning to LFMC estimation based on the historical LFMC ground samples of the Globe-LFMC database, as a step towards operational daily LFMC mapping in the Contiguous United States (CONUS). One-year MODerate resolution Imaging Spectroradiometer (MODIS) time series preceding each LFMC sample were extracted as the primary data source for training. The proposed temporal convolutional neural network for LFMC (TempCNN-LFMC) comprises three 1-D convolutional layers that learn the multi-scale temporal dynamics (features) of one-year MODIS time series specific to LFMC estimation. The learned features, together with a few auxiliary variables (e.g., digital elevation model), are then passed to three fully connected layers to extract the non-linear relationships with LFMC. In the primary training and validation scenario, the neural network was trained using samples from 2002 to 2013 and then adopted to estimating the LFMC from 2014 to 2018, achieving an overall root mean square error (RMSE) of 25.57% and a correlation coefficient (R) of 0.74. Good consistency on spatial patterns and temporal trends of accuracy was observed. The trained model achieved a similar RMSE of 25.98%, 25.20% and 25.93% for forest, shrubland, and grassland, respectively, without requiring prior information on the vegetation type.

Wang, Y., Yang, L., Webb, G. I., Ge, Z., & Song, J.
Bioinformatics, 37(21), 3986–3988, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{10.1093/bioinformatics/btab416,
  author   = {Wang, Yanan and Yang, Litao and Webb, Geoffrey I and Ge, Zongyuan and Song, Jiangning},
  title    = {{OCTID}: a one-class learning-based {Python} package for tumor image detection},
  journal  = {Bioinformatics},
  year     = {2021},
  volume   = {37},
  number   = {21},
  pages    = {3986--3988},
  issn     = {1367-4803},
  doi      = {10.1093/bioinformatics/btab416},
  keywords = {health},
  related  = {health},
  abstract = {Tumor tile selection is a necessary prerequisite in patch-based cancer whole slide image analysis, which is labor-intensive and requires expertise. Whole slides are annotated as tumor or tumor free, but tiles within a tumor slide are not. As all tiles within a tumor free slide are tumor free, these can be used to capture tumor-free patterns using the one-class learning strategy. We present a Python package, termed OCTID, which combines a pretrained convolutional neural network (CNN) model, Uniform Manifold Approximation and Projection (UMAP) and one-class support vector machine to achieve accurate tumor tile classification using a training set of tumor free tiles. Benchmarking experiments on four H&E image datasets achieved remarkable performance in terms of F1-score (0.90 +/- 0.06), Matthews correlation coefficient (0.93 +/- 0.05) and accuracy (0.94 +/- 0.03). Detailed information can be found in the Supplementary File. Supplementary data are available at Bioinformatics online.},
}
ABSTRACT Tumor tile selection is a necessary prerequisite in patch-based cancer whole slide image analysis, which is labor-intensive and requires expertise. Whole slides are annotated as tumor or tumor free, but tiles within a tumor slide are not. As all tiles within a tumor free slide are tumor free, these can be used to capture tumor-free patterns using the one-class learning strategy. We present a Python package, termed OCTID, which combines a pretrained convolutional neural network (CNN) model, Uniform Manifold Approximation and Projection (UMAP) and one-class support vector machine to achieve accurate tumor tile classification using a training set of tumor free tiles. Benchmarking experiments on four H&E image datasets achieved remarkable performance in terms of F1-score (0.90 +/- 0.06), Matthews correlation coefficient (0.93 +/- 0.05) and accuracy (0.94 +/- 0.03). Detailed information can be found in the Supplementary File. Supplementary data are available at Bioinformatics online.

Mei, S., Li, F., Xiang, D., Ayala, R., Faridi, P., Webb, G. I., Illing, P. T., Rossjohn, J., Akutsu, T., Croft, N. P., Purcell, A. W., & Song, J.
Briefings in Bioinformatics, 22(5), 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{Mei2021,
  author   = {Mei, Shutao and Li, Fuyi and Xiang, Dongxu and Ayala, Rochelle and Faridi, Pouya and Webb, Geoffrey I and Illing, Patricia T and Rossjohn, Jamie and Akutsu, Tatsuya and Croft, Nathan P and Purcell, Anthony W and Song, Jiangning},
  title    = {{Anthem}: a user customised tool for fast and accurate prediction of binding between peptides and {HLA} class {I} molecules},
  journal  = {Briefings in Bioinformatics},
  year     = {2021},
  volume   = {22},
  number   = {5},
  issn     = {1477-4054},
  doi      = {10.1093/bib/bbaa415},
  keywords = {Bioinformatics and DP140100087},
  related  = {computational-biology},
  abstract = {Neopeptide-based immunotherapy has been recognised as a promising approach for the treatment of cancers. For neopeptides to be recognised by CD8+ T cells and induce an immune response, their binding to human leukocyte antigen class I (HLA-I) molecules is a necessary first step. Most epitope prediction tools thus rely on the prediction of such binding. With the use of mass spectrometry, the scale of naturally presented HLA ligands that could be used to develop such predictors has been expanded. However, there are rarely efforts that focus on the integration of these experimental data with computational algorithms to efficiently develop up-to-date predictors. Here, we present Anthem for accurate HLA-I binding prediction. In particular, we have developed a user-friendly framework to support the development of customisable HLA-I binding prediction models to meet challenges associated with the rapidly increasing availability of large amounts of immunopeptidomic data. Our extensive evaluation, using both independent and experimental datasets shows that Anthem achieves an overall similar or higher area under curve value compared with other contemporary tools. It is anticipated that Anthem will provide a unique opportunity for the non-expert user to analyse and interpret their own in-house or publicly deposited datasets.},
}
ABSTRACT Neopeptide-based immunotherapy has been recognised as a promising approach for the treatment of cancers. For neopeptides to be recognised by CD8+ T cells and induce an immune response, their binding to human leukocyte antigen class I (HLA-I) molecules is a necessary first step. Most epitope prediction tools thus rely on the prediction of such binding. With the use of mass spectrometry, the scale of naturally presented HLA ligands that could be used to develop such predictors has been expanded. However, there are rarely efforts that focus on the integration of these experimental data with computational algorithms to efficiently develop up-to-date predictors. Here, we present Anthem for accurate HLA-I binding prediction. In particular, we have developed a user-friendly framework to support the development of customisable HLA-I binding prediction models to meet challenges associated with the rapidly increasing availability of large amounts of immunopeptidomic data. Our extensive evaluation, using both independent and experimental datasets shows that Anthem achieves an overall similar or higher area under curve value compared with other contemporary tools. It is anticipated that Anthem will provide a unique opportunity for the non-expert user to analyse and interpret their own in-house or publicly deposited datasets.

Dempster, A., Schmidt, D. F., & Webb, G. I.
Proceedings of the 27th SIGKDD Conference on Knowledge Discovery and Data Mining, pp. 248-257, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@inproceedings{dempsteretal21kdd,
  author    = {Dempster, Angus and Schmidt, Daniel F. and Webb, Geoffrey I.},
  title     = {{MINIROCKET:} {A} Very Fast (Almost) Deterministic Transform for Time Series Classification},
  booktitle = {Proceedings of the 27th SIGKDD Conference on Knowledge Discovery and Data Mining},
  year      = {2021},
  pages     = {248-257},
  doi       = {10.1145/3447548.3467231},
  url       = {https://arxiv.org/abs/2012.08791},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
  abstract  = {Until recently, the most accurate methods for time series classification were limited by high computational complexity. ROCKET achieves state-of-the-art accuracy with a fraction of the computational expense of most existing methods by transforming input time series using random convolutional kernels, and using the transformed features to train a linear classifier. We reformulate ROCKET into a new method, MINIROCKET, making it up to 75 times faster on larger datasets, and making it almost deterministic (and optionally, with additional computational expense, fully deterministic), while maintaining essentially the same accuracy. Using this method, it is possible to train and test a classifier on all of 109 datasets from the UCR archive to state-of-the-art accuracy in less than 10 minutes. MINIROCKET is significantly faster than any other method of comparable accuracy (including ROCKET), and significantly more accurate than any other method of even roughly-similar computational expense. As such, we suggest that MINIROCKET should now be considered and used as the default variant of ROCKET.},
}
ABSTRACT Until recently, the most accurate methods for time series classification were limited by high computational complexity. ROCKET achieves state-of-the-art accuracy with a fraction of the computational expense of most existing methods by transforming input time series using random convolutional kernels, and using the transformed features to train a linear classifier. We reformulate ROCKET into a new method, MINIROCKET, making it up to 75 times faster on larger datasets, and making it almost deterministic (and optionally, with additional computational expense, fully deterministic), while maintaining essentially the same accuracy. Using this method, it is possible to train and test a classifier on all of 109 datasets from the UCR archive to state-of-the-art accuracy in less than 10 minutes. MINIROCKET is significantly faster than any other method of comparable accuracy (including ROCKET), and significantly more accurate than any other method of even roughly-similar computational expense. As such, we suggest that MINIROCKET should now be considered and used as the default variant of ROCKET.

Wang, Y., Coudray, N., Zhao, Y., Li, F., Hu, C., Zhang, Y., Imoto, S., Tsirigos, A., Webb, G. I., Daly, R. J., & Song, J.
Bioinformatics, 37(22), 4291-4295, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{Wang2021,
  author    = {Wang, Yanan and Coudray, Nicolas and Zhao, Yun and Li, Fuyi and Hu, Changyuan and Zhang, Yao-Zhong and Imoto, Seiya and Tsirigos, Aristotelis and Webb, Geoffrey I and Daly, Roger J and Song, Jiangning},
  title     = {{HEAL}: an automated deep learning framework for cancer histopathology image analysis},
  journal   = {Bioinformatics},
  year      = {2021},
  volume    = {37},
  number    = {22},
  pages     = {4291-4295},
  publisher = {Oxford University Press ({OUP})},
  doi       = {10.1093/bioinformatics/btab380},
  keywords  = {health},
  related   = {health},
  abstract  = {Digital pathology supports analysis of histopathological images using deep learning methods at a large-scale. However, applications of deep learning in this area have been limited by the complexities of configuration of the computational environment and of hyperparameter optimization, which hinder deployment and reduce reproducibility. Here, we propose HEAL, a deep learning-based automated framework for easy, flexible, and multi-faceted histopathological image analysis. We demonstrate its utility and functionality by performing two case studies on lung cancer and one on colon cancer. Leveraging the capability of Docker, HEAL represents an ideal end-to-end tool to conduct complex histopathological analysis and enables deep learning in a broad range of applications for cancer image analysis. Supplementary data are available at Bioinformatics online.},
}
ABSTRACT Digital pathology supports analysis of histopathological images using deep learning methods at a large-scale. However, applications of deep learning in this area have been limited by the complexities of configuration of the computational environment and of hyperparameter optimization, which hinder deployment and reduce reproducibility. Here, we propose HEAL, a deep learning-based automated framework for easy, flexible, and multi-faceted histopathological image analysis. We demonstrate its utility and functionality by performing two case studies on lung cancer and one on colon cancer. Leveraging the capability of Docker, HEAL represents an ideal end-to-end tool to conduct complex histopathological analysis and enables deep learning in a broad range of applications for cancer image analysis. Supplementary data are available at Bioinformatics online.

Boley, M., Teshuva, S., Le Bodic, P., & Webb, G. I.
Proceedings of the 2021 SIAM International Conference on Data Mining (SDM), pp. 351-359, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@inproceedings{boley2021better,
  author       = {Boley, Mario and Teshuva, Simon and Le Bodic, Pierre and Webb, Geoffrey I},
  title        = {Better Short than Greedy: Interpretable Models through Optimal Rule Boosting},
  booktitle    = {Proceedings of the 2021 SIAM International Conference on Data Mining (SDM)},
  year         = {2021},
  pages        = {351-359},
  organization = {SIAM},
  url          = {https://arxiv.org/abs/2101.08380},
  keywords     = {Rule Learning and OPUS and Association Rule Discovery},
  related      = {opus-search},
  abstract     = {Rule ensembles are designed to provide a useful trade-off between predictive accuracy and model interpretability. However, the myopic and random search components of current rule ensemble methods can compromise this goal: they often need more rules than necessary to reach a certain accuracy level or can even outright fail to accurately model a distribution that can actually be described well with a few rules. Here, we present a novel approach aiming to fit rule ensembles of maximal predictive power for a given ensemble size (and thus model comprehensibility). In particular, we present an efficient branch-and-bound algorithm that optimally solves the per-rule objective function of the popular second-order gradient boosting framework. Our main insight is that the boosting objective can be tightly bounded in linear time of the number of covered data points. Along with an additional novel pruning technique related to rule redundancy, this leads to a computationally feasible approach for boosting optimal rules that, as we demonstrate on a wide range of common benchmark problems, consistently outperforms the predictive performance of boosting greedy rules.},
}
ABSTRACT Rule ensembles are designed to provide a useful trade-off between predictive accuracy and model interpretability. However, the myopic and random search components of current rule ensemble methods can compromise this goal: they often need more rules than necessary to reach a certain accuracy level or can even outright fail to accurately model a distribution that can actually be described well with a few rules. Here, we present a novel approach aiming to fit rule ensembles of maximal predictive power for a given ensemble size (and thus model comprehensibility). In particular, we present an efficient branch-and-bound algorithm that optimally solves the per-rule objective function of the popular second-order gradient boosting framework. Our main insight is that the boosting objective can be tightly bounded in linear time of the number of covered data points. Along with an additional novel pruning technique related to rule redundancy, this leads to a computationally feasible approach for boosting optimal rules that, as we demonstrate on a wide range of common benchmark problems, consistently outperforms the predictive performance of boosting greedy rules.

Chen, Z., Zhao, P., Li, C., Li, F., Xiang, D., Chen, Y., Akutsu, T., Daly, R. J., Webb, G. I., Zhao, Q., Kurgan, L., & Song, J.
Nucleic Acids Research, 49(10), Art. no. e60, 2021.
Clarivate Web of Science Highly Cited Paper 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{ChenEtAl21,
  author        = {Chen, Zhen and Zhao, Pei and Li, Chen and Li, Fuyi and Xiang, Dongxu and Chen, Yong-Zi and Akutsu, Tatsuya and Daly, Roger J and Webb, Geoffrey I and Zhao, Quanzhi and Kurgan, Lukasz and Song, Jiangning},
  title         = {{iLearnPlus}: a comprehensive and automated machine-learning platform for nucleic acid and protein sequence analysis, prediction and visualization},
  journal       = {Nucleic Acids Research},
  year          = {2021},
  volume        = {49},
  number        = {10},
  articlenumber = {e60},
  issn          = {0305-1048},
  doi           = {10.1093/nar/gkab122},
  comment       = {Clarivate Web of Science Highly Cited Paper 2022},
  keywords      = {Bioinformatics},
  related       = {computational-biology},
  abstract      = {Sequence-based analysis and prediction are fundamental bioinformatic tasks that facilitate understanding of the sequence(-structure)-function paradigm for DNAs, RNAs and proteins. Rapid accumulation of sequences requires equally pervasive development of new predictive models, which depends on the availability of effective tools that support these efforts. We introduce iLearnPlus, the first machine-learning platform with graphical- and web-based interfaces for the construction of machine-learning pipelines for analysis and predictions using nucleic acid and protein sequences. iLearnPlus provides a comprehensive set of algorithms and automates sequence-based feature extraction and analysis, construction and deployment of models, assessment of predictive performance, statistical analysis, and data visualization; all without programming. iLearnPlus includes a wide range of feature sets which encode information from the input sequences and over twenty machine-learning algorithms that cover several deep-learning approaches, outnumbering the current solutions by a wide margin. Our solution caters to experienced bioinformaticians, given the broad range of options, and biologists with no programming background, given the point-and-click interface and easy-to-follow design process. We showcase iLearnPlus with two case studies concerning prediction of long noncoding RNAs (lncRNAs) from RNA transcripts and prediction of crotonylation sites in protein chains. iLearnPlus is an open-source platform available at https://github.com/Superzchen/iLearnPlus/ with the webserver at http://ilearnplus.erc.monash.edu/.},
}
ABSTRACT Sequence-based analysis and prediction are fundamental bioinformatic tasks that facilitate understanding of the sequence(-structure)-function paradigm for DNAs, RNAs and proteins. Rapid accumulation of sequences requires equally pervasive development of new predictive models, which depends on the availability of effective tools that support these efforts. We introduce iLearnPlus, the first machine-learning platform with graphical- and web-based interfaces for the construction of machine-learning pipelines for analysis and predictions using nucleic acid and protein sequences. iLearnPlus provides a comprehensive set of algorithms and automates sequence-based feature extraction and analysis, construction and deployment of models, assessment of predictive performance, statistical analysis, and data visualization; all without programming. iLearnPlus includes a wide range of feature sets which encode information from the input sequences and over twenty machine-learning algorithms that cover several deep-learning approaches, outnumbering the current solutions by a wide margin. Our solution caters to experienced bioinformaticians, given the broad range of options, and biologists with no programming background, given the point-and-click interface and easy-to-follow design process. We showcase iLearnPlus with two case studies concerning prediction of long noncoding RNAs (lncRNAs) from RNA transcripts and prediction of crotonylation sites in protein chains. iLearnPlus is an open-source platform available at https://github.com/Superzchen/iLearnPlus/ with the webserver at http://ilearnplus.erc.monash.edu/.

Krempl, G., Hofer, V., Webb, G., & Hüllermeier, E.
Schloss Dagstuhl - Leibniz-Zentrum für Informatik, 2021.
[Bibtex]  → Access on publisher site  → Related papers and software

@techreport{Krempl2021,
  author      = {Krempl, Georg and Hofer, Vera and Webb, Geoffrey and H{\"u}llermeier, Eyke},
  title       = {Beyond Adaptation: Understanding Distributional Changes ({Dagstuhl} Seminar 20372)},
  institution = {Schloss Dagstuhl - Leibniz-Zentrum f{\"u}r Informatik},
  year        = {2021},
  doi         = {10.4230/DAGREP.10.4.1},
  keywords    = {Statistical Machine Learning, Data Streams, Concept Drift, Non-Stationary Non-IID Data, Change Mining, Dagstuhl Seminar, Theory of computation - Machine learning theory, Mathematics of computing - Time series analysis, Computing methodologies - Multi-task learning, Computing methodologies - Learning under covariate shift, Computing methodologies - Lifelong machine learning},
  related     = {learning-from-non-stationary-distributions},
}
ABSTRACT 

Li, M., Wang, Y., Li, F., Zhao, Y., Liu, M., Zhang, S., Bin, Y., Smith, A. I., Webb, G., Li, J., Song, J., & Xia, J.
IEEE/ACM Transactions on Computational Biology and Bioinformatics, 18(5), 1801-1810, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{RN3447,
author = {Li, M. and Wang, Y. and Li, F. and Zhao, Y. and Liu, M. and Zhang, S. and Bin, Y. and Smith, A. I. and Webb, G. and Li, J. and Song, J. and Xia, J.},
journal = {IEEE/ACM Transactions on Computational Biology and Bioinformatics},
title = {A Deep Learning-Based Method for Identification of Bacteriophage-Host Interaction},
year = {2021},
issn = {1545-5963},
number = {5},
pages = {1801-1810},
volume = {18},
abstract = {Multi-drug resistance (MDR) has become one of the greatest threats to human health worldwide, and novel treatment methods of infections caused by MDR bacteria are urgently needed. Phage therapy is a promising alternative to solve this problem, to which the key is correctly matching target pathogenic bacteria with the corresponding therapeutic phage. Deep learning is powerful for mining complex patterns to generate accurate predictions. In this study, we develop PredPHI (Predicting Phage-Host Interactions), a deep learning-based tool capable of predicting the host of phages from sequence data. We collect >3000 phage-host pairs along with their protein sequences from PhagesDB and GenBank databases and extract a set of features. Then we select high-quality negative samples based on the K-Means clustering method and construct a balanced training set. Finally, we employ a deep convolutional neural network to build the predictive model. The results indicate that PredPHI can achieve a predictive performance of 81% in terms of the area under the receiver operating characteristic curve on the test set, and the clustering-based method is significantly more robust than that based on randomly selecting negative samples. These results highlight that PredPHI is a useful and accurate tool for identifying phage-host interactions from sequence data.},
doi = {10.1109/tcbb.2020.3017386},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Multi-drug resistance (MDR) has become one of the greatest threats to human health worldwide, and novel treatment methods of infections caused by MDR bacteria are urgently needed. Phage therapy is a promising alternative to solve this problem, to which the key is correctly matching target pathogenic bacteria with the corresponding therapeutic phage. Deep learning is powerful for mining complex patterns to generate accurate predictions. In this study, we develop PredPHI (Predicting Phage-Host Interactions), a deep learning-based tool capable of predicting the host of phages from sequence data. We collect >3000 phage-host pairs along with their protein sequences from PhagesDB and GenBank databases and extract a set of features. Then we select high-quality negative samples based on the K-Means clustering method and construct a balanced training set. Finally, we employ a deep convolutional neural network to build the predictive model. The results indicate that PredPHI can achieve a predictive performance of 81% in terms of the area under the receiver operating characteristic curve on the test set, and the clustering-based method is significantly more robust than that based on randomly selecting negative samples. These results highlight that PredPHI is a useful and accurate tool for identifying phage-host interactions from sequence data.

Webb, G. I., & Petitjean, F.
Pattern Recognition, 115, Art. no. 107895, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WEBB2021107895,
  author        = {Webb, Geoffrey I. and Petitjean, Fran\c{c}ois},
  title         = {Tight lower bounds for Dynamic Time Warping},
  journal       = {Pattern Recognition},
  year          = {2021},
  volume        = {115},
  issn          = {0031-3203},
  articlenumber = {107895},
  doi           = {10.1016/j.patcog.2021.107895},
  keywords      = {time series},
  related       = {scalable-time-series-classifiers},
  abstract      = {Dynamic Time Warping (DTW) is a popular similarity measure for aligning and comparing time series. Due to DTW's high computation time, lower bounds are often employed to screen poor matches. Many alternative lower bounds have been proposed, providing a range of different trade-offs between tightness and computational efficiency. LB_KEOGH provides a useful trade-off in many applications. Two recent lower bounds, LB_IMPROVED and LB_ENHANCED, are substantially tighter than LB_KEOGH. All three have the same worst case computational complexity - linear with respect to series length and constant with respect to window size. We present four new DTW lower bounds in the same complexity class. LB_PETITJEAN is substantially tighter than LB_IMPROVED, with only modest additional computational overhead. LB_WEBB is more efficient than LB_IMPROVED, while often providing a tighter bound. LB_WEBB is always tighter than LB_KEOGH. The parameter free LB_WEBB is usually tighter than LB_ENHANCED. A parameterized variant, LB_Webb_Enhanced, is always tighter than LB_ENHANCED. A further variant, LB_WEBB*, is useful for some constrained distance functions. In extensive experiments, LB_WEBB proves to be very effective for nearest neighbor search.},
}
ABSTRACT Dynamic Time Warping (DTW) is a popular similarity measure for aligning and comparing time series. Due to DTW's high computation time, lower bounds are often employed to screen poor matches. Many alternative lower bounds have been proposed, providing a range of different trade-offs between tightness and computational efficiency. LB_KEOGH provides a useful trade-off in many applications. Two recent lower bounds, LB_IMPROVED and LB_ENHANCED, are substantially tighter than LB_KEOGH. All three have the same worst case computational complexity - linear with respect to series length and constant with respect to window size. We present four new DTW lower bounds in the same complexity class. LB_PETITJEAN is substantially tighter than LB_IMPROVED, with only modest additional computational overhead. LB_WEBB is more efficient than LB_IMPROVED, while often providing a tighter bound. LB_WEBB is always tighter than LB_KEOGH. The parameter free LB_WEBB is usually tighter than LB_ENHANCED. A parameterized variant, LB_Webb_Enhanced, is always tighter than LB_ENHANCED. A further variant, LB_WEBB*, is useful for some constrained distance functions. In extensive experiments, LB_WEBB proves to be very effective for nearest neighbor search.

Iqbal, S., Li, F., Akutsu, T., Ascher, D. B., Webb, G. I., & Song, J.
Briefings in Bioinformatics, 22(6), Art. no. bbab184, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbab184,
  author        = {Iqbal, Shahid and Li, Fuyi and Akutsu, Tatsuya and Ascher, David B and Webb, Geoffrey I and Song, Jiangning},
  title         = {Assessing the performance of computational predictors for estimating protein stability changes upon missense mutations},
  journal       = {Briefings in Bioinformatics},
  year          = {2021},
  volume        = {22},
  number        = {6},
  issn          = {1477-4054},
  articlenumber = {bbab184},
  doi           = {10.1093/bib/bbab184},
  keywords      = {Bioinformatics and DP140100087},
  related       = {computational-biology},
  abstract      = {Understanding how a mutation might affect protein stability is of significant importance to protein engineering and for understanding protein evolution genetic diseases. While a number of computational tools have been developed to predict the effect of missense mutations on protein stability protein stability upon mutations, they are known to exhibit large biases imparted in part by the data used to train and evaluate them. Here, we provide a comprehensive overview of predictive tools, which has provided an evolving insight into the importance and relevance of features that can discern the effects of mutations on protein stability. A diverse selection of these freely available tools was benchmarked using a large mutation-level blind dataset of 1342 experimentally characterised mutations across 130 proteins from ThermoMutDB, a second test dataset encompassing 630 experimentally characterised mutations across 39 proteins from iStable2.0 and a third blind test dataset consisting of 268 mutations in 27 proteins from the newly published ProThermDB. The performance of the methods was further evaluated with respect to the site of mutation, type of mutant residue and by ranging the pH and temperature. Additionally, the classification performance was also evaluated by classifying the mutations as stabilizing (delta delta G>=0) or destabilizing (delta delta G<0). The results reveal that the performance of the predictors is affected by the site of mutation and the type of mutant residue. Further, the results show very low performance for pH values 6-8 and temperature higher than 65 for all predictors except iStable2.0 on the S630 dataset. To illustrate how stability and structure change upon single point mutation, we considered four stabilizing, two destabilizing and two stabilizing mutations from two proteins, namely the toxin protein and bovine liver cytochrome. 
Overall, the results on S268, S630 and S1342 datasets show that the performance of the integrated predictors is better than the mechanistic or individual machine learning predictors. We expect that this paper will provide useful guidance for the design and development of next-generation bioinformatic tools for predicting protein stability changes upon mutations.},
}
ABSTRACT Understanding how a mutation might affect protein stability is of significant importance to protein engineering and for understanding protein evolution genetic diseases. While a number of computational tools have been developed to predict the effect of missense mutations on protein stability protein stability upon mutations, they are known to exhibit large biases imparted in part by the data used to train and evaluate them. Here, we provide a comprehensive overview of predictive tools, which has provided an evolving insight into the importance and relevance of features that can discern the effects of mutations on protein stability. A diverse selection of these freely available tools was benchmarked using a large mutation-level blind dataset of 1342 experimentally characterised mutations across 130 proteins from ThermoMutDB, a second test dataset encompassing 630 experimentally characterised mutations across 39 proteins from iStable2.0 and a third blind test dataset consisting of 268 mutations in 27 proteins from the newly published ProThermDB. The performance of the methods was further evaluated with respect to the site of mutation, type of mutant residue and by ranging the pH and temperature. Additionally, the classification performance was also evaluated by classifying the mutations as stabilizing (delta delta G>=0) or destabilizing (delta delta G<0). The results reveal that the performance of the predictors is affected by the site of mutation and the type of mutant residue. Further, the results show very low performance for pH values 6-8 and temperature higher than 65 for all predictors except iStable2.0 on the S630 dataset. To illustrate how stability and structure change upon single point mutation, we considered four stabilizing, two destabilizing and two stabilizing mutations from two proteins, namely the toxin protein and bovine liver cytochrome. 
Overall, the results on S268, S630 and S1342 datasets show that the performance of the integrated predictors is better than the mechanistic or individual machine learning predictors. We expect that this paper will provide useful guidance for the design and development of next-generation bioinformatic tools for predicting protein stability changes upon mutations.

Tan, C. W., Bergmeir, C., Petitjean, F., & Webb, G. I.
Data Mining and Knowledge Discovery, 35(3), 1032-1060, 2021.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{tan2021regression,
author = {Tan, Chang Wei and Bergmeir, Christoph and Petitjean, Fran\c{c}ois and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {Time series extrinsic regression},
year = {2021},
issn = {1573-756X},
number = {3},
pages = {1032-1060},
volume = {35},
abstract = {This paper studies time series extrinsic regression (TSER): a regression task of which the aim is to learn the relationship between a time series and a continuous scalar variable; a task closely related to time series classification (TSC), which aims to learn the relationship between a time series and a categorical class label. This task generalizes time series forecasting, relaxing the requirement that the value predicted be a future value of the input series or primarily depend on more recent values. In this paper, we motivate and study this task, and benchmark existing solutions and adaptations of TSC algorithms on a novel archive of 19 TSER datasets which we have assembled. Our results show that the state-of-the-art TSC algorithm Rocket, when adapted for regression, achieves the highest overall accuracy compared to adaptations of other TSC algorithms and state-of-the-art machine learning (ML) algorithms such as XGBoost, Random Forest and Support Vector Regression. More importantly, we show that much research is needed in this field to improve the accuracy of ML models. We also find evidence that further research has excellent prospects of improving upon these straightforward baselines.},
doi = {10.1007/s10618-021-00745-9},
keywords = {time series},
publisher = {Springer US},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/cgCAn},
}
ABSTRACT This paper studies time series extrinsic regression (TSER): a regression task of which the aim is to learn the relationship between a time series and a continuous scalar variable; a task closely related to time series classification (TSC), which aims to learn the relationship between a time series and a categorical class label. This task generalizes time series forecasting, relaxing the requirement that the value predicted be a future value of the input series or primarily depend on more recent values. In this paper, we motivate and study this task, and benchmark existing solutions and adaptations of TSC algorithms on a novel archive of 19 TSER datasets which we have assembled. Our results show that the state-of-the-art TSC algorithm Rocket, when adapted for regression, achieves the highest overall accuracy compared to adaptations of other TSC algorithms and state-of-the-art machine learning (ML) algorithms such as XGBoost, Random Forest and Support Vector Regression. More importantly, we show that much research is needed in this field to improve the accuracy of ML models. We also find evidence that further research has excellent prospects of improving upon these straightforward baselines.

Chen, Z., Zhao, P., Li, F., Leier, A., Marquez-Lago, T. T., Webb, G. I., Baggag, A., Bensmail, H., & Song, J.
Journal of Bioinformatics and Computational Biology, 18(4), Art. no. 2050018, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Chen2020,
author = {Zhen Chen and Pei Zhao and Fuyi Li and Andr{\'{e}} Leier and Tatiana T. Marquez-Lago and Geoffrey I. Webb and Abdelkader Baggag and Halima Bensmail and Jiangning Song},
journal = {Journal of Bioinformatics and Computational Biology},
title = {{PROSPECT}: A web server for predicting protein histidine phosphorylation sites},
year = {2020},
month = jun,
number = {4},
volume = {18},
abstract = {Background: Phosphorylation of histidine residues plays crucial roles in signaling pathways and cell metabolism in prokaryotes such as bacteria. While evidence has emerged that protein histidine phosphorylation also occurs in more complex organisms, its role in mammalian cells has remained largely uncharted. Thus, it is highly desirable to develop computational tools that are able to identify histidine phosphorylation sites. Result: Here, we introduce PROSPECT that enables fast and accurate prediction of proteome-wide histidine phosphorylation substrates and sites. Our tool is based on a hybrid method that integrates the outputs of two convolutional neural network (CNN)-based classifiers and a random forest-based classifier. Three features, including the one-of-K coding, enhanced grouped amino acids content (EGAAC) and composition of k-spaced amino acid group pairs (CKSAAGP) encoding, were taken as the input to three classifiers, respectively. Our results show that it is able to accurately predict histidine phosphorylation sites from sequence information. Our PROSPECT web server is user-friendly and publicly available at http://PROSPECT.erc.monash.edu/. Conclusions: PROSPECT is superior than other pHis predictors in both the running speed and prediction accuracy and we anticipate that the PROSPECT webserver will become a popular tool for identifying the pHis sites in bacteria.},
articlenumber = {2050018},
doi = {10.1142/s0219720020500183},
keywords = {Bioinformatics},
publisher = {World Scientific},
related = {computational-biology},
}
ABSTRACT Background: Phosphorylation of histidine residues plays crucial roles in signaling pathways and cell metabolism in prokaryotes such as bacteria. While evidence has emerged that protein histidine phosphorylation also occurs in more complex organisms, its role in mammalian cells has remained largely uncharted. Thus, it is highly desirable to develop computational tools that are able to identify histidine phosphorylation sites. Result: Here, we introduce PROSPECT that enables fast and accurate prediction of proteome-wide histidine phosphorylation substrates and sites. Our tool is based on a hybrid method that integrates the outputs of two convolutional neural network (CNN)-based classifiers and a random forest-based classifier. Three features, including the one-of-K coding, enhanced grouped amino acids content (EGAAC) and composition of k-spaced amino acid group pairs (CKSAAGP) encoding, were taken as the input to three classifiers, respectively. Our results show that it is able to accurately predict histidine phosphorylation sites from sequence information. Our PROSPECT web server is user-friendly and publicly available at http://PROSPECT.erc.monash.edu/. Conclusions: PROSPECT is superior than other pHis predictors in both the running speed and prediction accuracy and we anticipate that the PROSPECT webserver will become a popular tool for identifying the pHis sites in bacteria.

Pratama, M., Pedrycz, W., & Webb, G. I.
IEEE Transactions on Fuzzy Systems, 28(7), 1315-1328, 2020.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{Pratama19,
  author   = {Pratama, M. and Pedrycz, W. and Webb, G. I.},
  title    = {An Incremental Construction of Deep Neuro Fuzzy System for Continual Learning of Non-stationary Data Streams},
  journal  = {IEEE Transactions on Fuzzy Systems},
  year     = {2020},
  volume   = {28},
  number   = {7},
  pages    = {1315-1328},
  issn     = {1063-6706},
  doi      = {10.1109/TFUZZ.2019.2939993},
  keywords = {Concept Drift},
  related  = {learning-from-non-stationary-distributions},
}
ABSTRACT 

Encyclopedia of Machine Learning and Data Science
Phung, D., Webb, G. I., & Sammut, C. (Ed).
Springer US, 2020.
[Bibtex]  → Access on publisher site

@Book{Phung2020,
  editor    = {Phung, Dinh and Webb, Geoffrey I. and Sammut, Claude},
  title     = {Encyclopedia of Machine Learning and Data Science},
  publisher = {Springer {US}},
  year      = {2020},
  doi       = {10.1007/978-1-4899-7502-7},
}
ABSTRACT 

Dempster, A., Petitjean, F., & Webb, G. I.
Data Mining and Knowledge Discovery, 34, 1454-1495, 2020.
Second Most Highly Cited Paper in Data Mining and Knowledge Discovery in 2020
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{dempster2020rocket,
author = {Angus Dempster and Fran\c{c}ois Petitjean and Geoffrey I. Webb},
journal = {Data Mining and Knowledge Discovery},
title = {{ROCKET}: Exceptionally fast and accurate time series classification using random convolutional kernels},
year = {2020},
number = {5},
pages = {1454-1495},
volume = {34},
abstract = {Most methods for time series classification that attain state-of-the-art accuracy have high computational complexity, requiring significant training time even for smaller datasets, and are intractable for larger datasets. Additionally, many existing methods focus on a single type of feature such as shape or frequency. Building on the recent success of convolutional neural networks for time series classification, we show that simple linear classifiers using random convolutional kernels achieve state-of-the-art accuracy with a fraction of the computational expense of existing methods. Using this method, it is possible to train and test a classifier on all 85 'bake off' datasets in the UCR archive in <2h, and it is possible to train a classifier on a large dataset of more than one million time series in approximately 1 h.},
comment = {Second Most Highly Cited Paper in Data Mining and Knowledge Discovery in 2020},
doi = {10.1007/s10618-020-00701-z},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/c1zg4},
}
ABSTRACT Most methods for time series classification that attain state-of-the-art accuracy have high computational complexity, requiring significant training time even for smaller datasets, and are intractable for larger datasets. Additionally, many existing methods focus on a single type of feature such as shape or frequency. Building on the recent success of convolutional neural networks for time series classification, we show that simple linear classifiers using random convolutional kernels achieve state-of-the-art accuracy with a fraction of the computational expense of existing methods. Using this method, it is possible to train and test a classifier on all 85 'bake off' datasets in the UCR archive in <2h, and it is possible to train a classifier on a large dataset of more than one million time series in approximately 1 h.

Li, F., Leier, A., Liu, Q., Wang, Y., Xiang, D., Akutsu, T., Webb, G. I., Smith, I. A., Marquez-Lago, T., Li, J., & Song, J.
Genomics, Proteomics & Bioinformatics, 18(1), 52-64, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{LI2020,
author = {Fuyi Li and Andr{\'{e}} Leier and Quanzhong Liu and Yanan Wang and Dongxu Xiang and Tatsuya Akutsu and Geoffrey I. Webb and A. Ian Smith and Tatiana Marquez-Lago and Jian Li and Jiangning Song},
journal = {Genomics, Proteomics \& Bioinformatics},
title = {Procleave: Predicting Protease-specific Substrate Cleavage Sites by Combining Sequence and Structural Information},
year = {2020},
issn = {1672-0229},
number = {1},
pages = {52-64},
volume = {18},
abstract = {Proteases are enzymes that cleave and hydrolyse the peptide bonds between two specific amino acid residues of target substrate proteins. Protease-controlled proteolysis plays a key role in the degradation and recycling of proteins, which is essential for various physiological processes. Thus, solving the substrate identification problem will have important implications for the precise understanding of functions and physiological roles of proteases, as well as for therapeutic target identification and pharmaceutical applicability. Consequently, there is a great demand for bioinformatics methods that can predict novel substrate cleavage events with high accuracy by utilizing both sequence and structural information. In this study, we present Procleave, a novel bioinformatics approach for predicting protease-specific substrates and specific cleavage sites by taking into account both their sequence and 3D structural information. Structural features of known cleavage sites were represented by discrete values using a LOWESS data-smoothing optimization method, which turned out to be critical for the performance of Procleave. The optimal approximations of all structural parameter values were encoded in a conditional random field (CRF) computational framework, alongside sequence and chemical group-based features. Here, we demonstrate the outstanding performance of Procleave through extensive benchmarking and independent tests. Procleave is capable of correctly identifying most cleavage sites in the case study. Importantly, when applied to the human structural proteome encompassing 17,628 protein structures, Procleave suggests a number of potential novel target substrates and their corresponding cleavage sites of different proteases. Procleave is implemented as a webserver and is freely accessible at http://procleave.erc.monash.edu/.},
doi = {10.1016/j.gpb.2019.08.002},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Proteases are enzymes that cleave and hydrolyse the peptide bonds between two specific amino acid residues of target substrate proteins. Protease-controlled proteolysis plays a key role in the degradation and recycling of proteins, which is essential for various physiological processes. Thus, solving the substrate identification problem will have important implications for the precise understanding of functions and physiological roles of proteases, as well as for therapeutic target identification and pharmaceutical applicability. Consequently, there is a great demand for bioinformatics methods that can predict novel substrate cleavage events with high accuracy by utilizing both sequence and structural information. In this study, we present Procleave, a novel bioinformatics approach for predicting protease-specific substrates and specific cleavage sites by taking into account both their sequence and 3D structural information. Structural features of known cleavage sites were represented by discrete values using a LOWESS data-smoothing optimization method, which turned out to be critical for the performance of Procleave. The optimal approximations of all structural parameter values were encoded in a conditional random field (CRF) computational framework, alongside sequence and chemical group-based features. Here, we demonstrate the outstanding performance of Procleave through extensive benchmarking and independent tests. Procleave is capable of correctly identifying most cleavage sites in the case study. Importantly, when applied to the human structural proteome encompassing 17,628 protein structures, Procleave suggests a number of potential novel target substrates and their corresponding cleavage sites of different proteases. Procleave is implemented as a webserver and is freely accessible at http://procleave.erc.monash.edu/.

Zaidi, N. A., Du, Y., & Webb, G. I.
IEEE Access, 8, 198856-198871, 2020.
[Bibtex] [Abstract]  → Access on publisher site

@Article{9245528,
author = {Zaidi, N. A. and Du, Y. and Webb, G. I.},
journal = {IEEE Access},
title = {On the Effectiveness of Discretizing Quantitative Attributes in Linear Classifiers},
year = {2020},
pages = {198856-198871},
volume = {8},
doi = {10.1109/ACCESS.2020.3034955},
internal-note = {Abstract withheld pending verification: the text previously stored here described branch-and-bound boosting of rule ensembles, which does not match this paper's title or DOI. Restore the abstract from the publisher record.},
}
ABSTRACT Rule ensembles are designed to provide a useful trade-off between predictive accuracy and model interpretability. However, the myopic and random search components of current rule ensemble methods can compromise this goal: they often need more rules than necessary to reach a certain accuracy level or can even outright fail to accurately model a distribution that can actually be described well with a few rules. Here, we present a novel approach aiming to fit rule ensembles of maximal predictive power for a given ensemble size (and thus model comprehensibility). In particular, we present an efficient branch-and-bound algorithm that optimally solves the per-rule objective function of the popular second-order gradient boosting framework. Our main insight is that the boosting objective can be tightly bounded in linear time of the number of covered data points. Along with an additional novel pruning technique related to rule redundancy, this leads to a computationally feasible approach for boosting optimal rules that, as we demonstrate on a wide range of common benchmark problems, consistently outperforms the predictive performance of boosting greedy rules.

IEEE 7th International Conference on Data Science and Advanced Analytics (DSAA)
Webb, G. I., Zhang, Z., Tseng, V. S., Williams, G., Vlachos, M., & Cao, L. (Ed).
IEEE Computer Society, 2020.
[Bibtex]  → Access on publisher site

@Proceedings{DSAA2020,
  editor    = {Webb, Geoffrey I and Zhang, Zhongfei and Tseng, Vincent S. and Williams, Graham and Vlachos, Michalis and Cao, Longbing},
  title     = {{IEEE} 7th International Conference on Data Science and Advanced Analytics ({DSAA})},
  publisher = {IEEE Computer Society},
  year      = {2020},
  doi       = {10.1109/DSAA49011.2020},
}
ABSTRACT 

Nguyen, K., Le, T., Nguyen, T., Webb, G., & Phung, D.
IEEE Transactions on Knowledge and Data Engineering, 34(9), 4425-4438, 2020.
[Bibtex] [Abstract]  → Access on publisher site

@Article{nguyen2020,
author = {K. Nguyen and T. Le and T. Nguyen and G. Webb and D. Phung},
journal = {IEEE Transactions on Knowledge and Data Engineering},
title = {Robust Variational Learning for Multiclass Kernel Models with Stein Refinement},
year = {2020},
number = {9},
pages = {4425-4438},
volume = {34},
abstract = {Kernel-based models have a strong generalization ability, but most of them, including SVM, are vulnerable to the curse of kernelization. Moreover, their predictive performances are sensitive to the hyperparameters tuning, which highly demands computational resources. These problems render kernel methods problematic when dealing with large-scale datasets. To this end, we first formulate the optimization problem in a kernel-based learning setting as a posterior inference problem, and then develop a rich family of Recurrent Neural Network-based variational inference techniques. Unlike existing literature, which stops at the variational distribution and uses it as the surrogate for the true posterior distribution, here we further leverage Stein Variational Gradient Descent to further bring the variational distribution closer to the true posterior, we refer to this step as Stein Refinement. Putting these altogether, we arrive at a robust and efficient variational learning method for multiclass kernel machines with extremely accurate approximation. Moreover, our formulation enables efficient learning of kernel parameters and hyperparameters which robustifies the proposed method against data uncertainties. The extensive experimental results show that our method, without tuning any parameter, obtains comparable performance to LIBSVM, a well-known implementation of SVM, and outperforms other baselines while being able to seamlessly scale with large-scale datasets.},
address = {Los Alamitos, CA, USA},
doi = {10.1109/TKDE.2020.3041509},
publisher = {IEEE Computer Society},
}
ABSTRACT Kernel-based models have a strong generalization ability, but most of them, including SVM, are vulnerable to the curse of kernelization. Moreover, their predictive performances are sensitive to the hyperparameters tuning, which highly demands computational resources. These problems render kernel methods problematic when dealing with large-scale datasets. To this end, we first formulate the optimization problem in a kernel-based learning setting as a posterior inference problem, and then develop a rich family of Recurrent Neural Network-based variational inference techniques. Unlike existing literature, which stops at the variational distribution and uses it as the surrogate for the true posterior distribution, here we further leverage Stein Variational Gradient Descent to further bring the variational distribution closer to the true posterior, we refer to this step as Stein Refinement. Putting these altogether, we arrive at a robust and efficient variational learning method for multiclass kernel machines with extremely accurate approximation. Moreover, our formulation enables efficient learning of kernel parameters and hyperparameters which robustifies the proposed method against data uncertainties. The extensive experimental results show that our method, without tuning any parameter, obtains comparable performance to LIBSVM, a well-known implementation of SVM, and outperforms other baselines while being able to seamlessly scale with large-scale datasets.

Fischer, R., Piatkowski, N., Pelletier, C., Webb, G. I., Petitjean, F., & Morik, K.
IEEE 7th International Conference on Data Science and Advanced Analytics (DSAA), pp. 546-555, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{Fischer2020,
author = {Raphael Fischer and Nico Piatkowski and Charlotte Pelletier and Geoffrey I. Webb and Fran{\c{c}}ois Petitjean and Katharina Morik},
booktitle = {{IEEE} 7th International Conference on Data Science and Advanced Analytics ({DSAA})},
title = {No Cloud on the Horizon: Probabilistic Gap Filling in Satellite Image Series},
year = {2020},
pages = {546--555},
publisher = {{IEEE}},
abstract = {Spatio-temporal data sets such as satellite image series are of utmost importance for understanding global developments like climate change or urbanization. However, incompleteness of data can greatly impact usability and knowledge discovery. In fact, there are many cases where not a single data point in the set is fully observed. For filling gaps, we introduce a novel approach that utilizes Markov random fields (MRFs). We extend the probabilistic framework to also consider empirical prior information, which allows to train even on highly incomplete data. Moreover, we devise a way to make discrete MRFs predict continuous values via state superposition. Experiments on real-world remote sensing imagery suffering from cloud cover show that the proposed approach outperforms state-of-the-art gap filling techniques.},
doi = {10.1109/dsaa49011.2020.00069},
keywords = {earth observation analytics},
related = {earth-observation},
}
ABSTRACT Spatio-temporal data sets such as satellite image series are of utmost importance for understanding global developments like climate change or urbanization. However, incompleteness of data can greatly impact usability and knowledge discovery. In fact, there are many cases where not a single data point in the set is fully observed. For filling gaps, we introduce a novel approach that utilizes Markov random fields (MRFs). We extend the probabilistic framework to also consider empirical prior information, which allows to train even on highly incomplete data. Moreover, we devise a way to make discrete MRFs predict continuous values via state superposition. Experiments on real-world remote sensing imagery suffering from cloud cover show that the proposed approach outperforms state-of-the-art gap filling techniques.

Tan, C. W., Petitjean, F., & Webb, G. I.
Data Mining and Knowledge Discovery, 34(1), 231-272, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Tan2019,
author = {Tan, Chang Wei and Petitjean, Fran{\c{c}}ois and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {{FastEE}: Fast Ensembles of Elastic Distances for time series classification},
year = {2020},
issn = {1573-756X},
number = {1},
pages = {231--272},
volume = {34},
abstract = {In recent years, many new ensemble-based time series classification (TSC) algorithms have been proposed. Each of them is significantly more accurate than their predecessors. The Hierarchical Vote Collective of Transformation-based Ensembles (HIVE-COTE) is currently the most accurate TSC algorithm when assessed on the UCR repository. It is a meta-ensemble of 5 state-of-the-art ensemble-based classifiers. The time complexity of HIVE-COTE---particularly for training---is prohibitive for most datasets. There is thus a critical need to speed up the classifiers that compose HIVE-COTE. This paper focuses on speeding up one of its components: Ensembles of Elastic Distances (EE), which is the classifier that leverages on the decades of research into the development of time-dedicated measures. Training EE can be prohibitive for many datasets. For example, it takes a month on the ElectricDevices dataset with 9000 instances. This is because EE needs to cross-validate the hyper-parameters used for the 11 similarity measures it encompasses. In this work, Fast Ensembles of Elastic Distances is proposed to train EE faster. There are two versions to it. The exact version makes it possible to train EE 10 times faster. The approximate version is 40 times faster than EE without significantly impacting the classification accuracy. This translates to being able to train EE on ElectricDevices in 13 h.},
doi = {10.1007/s10618-019-00663-x},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/c1y5a},
}
ABSTRACT In recent years, many new ensemble-based time series classification (TSC) algorithms have been proposed. Each of them is significantly more accurate than their predecessors. The Hierarchical Vote Collective of Transformation-based Ensembles (HIVE-COTE) is currently the most accurate TSC algorithm when assessed on the UCR repository. It is a meta-ensemble of 5 state-of-the-art ensemble-based classifiers. The time complexity of HIVE-COTE—particularly for training—is prohibitive for most datasets. There is thus a critical need to speed up the classifiers that compose HIVE-COTE. This paper focuses on speeding up one of its components: Ensembles of Elastic Distances (EE), which is the classifier that leverages on the decades of research into the development of time-dedicated measures. Training EE can be prohibitive for many datasets. For example, it takes a month on the ElectricDevices dataset with 9000 instances. This is because EE needs to cross-validate the hyper-parameters used for the 11 similarity measures it encompasses. In this work, Fast Ensembles of Elastic Distances is proposed to train EE faster. There are two versions to it. The exact version makes it possible to train EE 10 times faster. The approximate version is 40 times faster than EE without significantly impacting the classification accuracy. This translates to being able to train EE on ElectricDevices in 13 h.

Lucas, B., Pelletier, C., Schmidt, D., Webb, G. I., & Petitjean, F.
IEEE International Geoscience and Remote Sensing Symposium, pp. 1074–1077, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{lucas2020unsupervised,
author = {Lucas, Benjamin and Pelletier, Charlotte and Schmidt, Daniel and Webb, Geoffrey I and Petitjean, Fran{\c{c}}ois},
booktitle = {IEEE International Geoscience and Remote Sensing Symposium},
title = {Unsupervised Domain Adaptation Techniques for Classification of Satellite Image Time Series},
year = {2020},
organization = {IEEE},
pages = {1074--1077},
abstract = {Land cover maps are vitally important to many elements of environmental management. However the machine learning algorithms used to produce them require a substantive quantity of labelled training data to reach the best levels of accuracy. When researchers wish to map an area where no labelled training data are available, one potential solution is to use a classifier trained on another geographical area and adapting it to the target location---this is known as Unsupervised Domain Adaptation (DA). In this paper we undertake the first experiments using unsupervised DA methods for the classification of satellite image time series (SITS) data. Our experiments draw the interesting conclusion that existing methods provide no benefit when used on SITS data, and that this is likely due to the temporal nature of the data and the change in class distributions between the regions. This suggests that an unsupervised domain adaptation technique for SITS would be extremely beneficial for land cover mapping.},
keywords = {time series, earth observation analytics},
related = {scalable-time-series-classifiers},
}
ABSTRACT Land cover maps are vitally important to many elements of environmental management. However the machine learning algorithms used to produce them require a substantive quantity of labelled training data to reach the best levels of accuracy. When researchers wish to map an area where no labelled training data are available, one potential solution is to use a classifier trained on another geographical area and adapting it to the target location—this is known as Unsupervised Domain Adaptation (DA). In this paper we undertake the first experiments using unsupervised DA methods for the classification of satellite image time series (SITS) data. Our experiments draw the interesting conclusion that existing methods provide no benefit when used on SITS data, and that this is likely due to the temporal nature of the data and the change in class distributions between the regions. This suggests that an unsupervised domain adaptation technique for SITS would be extremely beneficial for land cover mapping.

Miller, L., Bolton, M., Boulton, J., Mintrom, M., Nicholson, A., Rüdiger, C., Skinner, R., Raven, R., & Webb, G. I.
IEEE/ITU International Conference on Artificial Intelligence for Good (AI4G), pp. 180-185, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{miller2020ai,
author = {Miller, Lynn and Bolton, Mitzi and Boulton, Julie and Mintrom, Michael and Nicholson, Ann and R{\"u}diger, Christoph and Skinner, Rob and Raven, Rob and Webb, Geoffrey I},
booktitle = {IEEE/ITU International Conference on Artificial Intelligence for Good (AI4G)},
title = {{AI} for monitoring the {Sustainable Development Goals} and supporting and promoting action and policy development},
year = {2020},
organization = {IEEE},
pages = {180--185},
abstract = {The United Nations sustainable development goals (SDGs) were ratified with much enthusiasm by all UN member states in 2015. However, subsequent progress to meet these goals has been hampered by a lack of data available to measure the SDG indicators (SDIs), and a lack of evidence-based insights to inform effective policy responses. We outline an interdisciplinary program of research into the use of artificial intelligence techniques to support measurement of the SDIs, using both machine learning methods to model SDI measurements and explainable AI techniques to present the outputs in a human-friendly manner. As well as addressing the technical concerns, we will investigate the governance issues of what forms of evidence, methods of collecting that evidence and means of its communication will most usefully inform effective policy development. By addressing these fundamental challenges, we aim to provide policy makers with the evidence needed to take effective action towards realising the Sustainable Development Goals.},
doi = {10.1109/AI4G50087.2020.9311014},
keywords = {earth observation analytics},
related = {earth-observation-analytics},
}
ABSTRACT The United Nations sustainable development goals (SDGs) were ratified with much enthusiasm by all UN member states in 2015. However, subsequent progress to meet these goals has been hampered by a lack of data available to measure the SDG indicators (SDIs), and a lack of evidence-based insights to inform effective policy responses. We outline an interdisciplinary program of research into the use of artificial intelligence techniques to support measurement of the SDIs, using both machine learning methods to model SDI measurements and explainable AI techniques to present the outputs in a human-friendly manner. As well as addressing the technical concerns, we will investigate the governance issues of what forms of evidence, methods of collecting that evidence and means of its communication will most usefully inform effective policy development. By addressing these fundamental challenges, we aim to provide policy makers with the evidence needed to take effective action towards realising the Sustainable Development Goals.

Chen, S., Webb, G. I., Liu, L., & Ma, X.
Knowledge-Based Systems, 192, Art. no. 105361, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{CHEN2019105361,
author = {Shenglei Chen and Geoffrey I. Webb and Linyuan Liu and Xin Ma},
journal = {Knowledge-Based Systems},
title = {A novel selective naive {Bayes} algorithm},
year = {2020},
volume = {192},
abstract = {Naive Bayes is one of the most popular data mining algorithms. Its efficiency comes from the assumption of attribute independence, although this might be violated in many real-world data sets. Many efforts have been done to mitigate the assumption, among which attribute selection is an important approach. However, conventional efforts to perform attribute selection in naive Bayes suffer from heavy computational overhead. This paper proposes an efficient selective naive Bayes algorithm, which adopts only some of the attributes to construct selective naive Bayes models. These models are built in such a way that each one is a trivial extension of another. The most predictive selective naive Bayes model can be selected by the measures of incremental leave-one-out cross validation. As a result, attributes can be selected by efficient model selection. Empirical results demonstrate that the selective naive Bayes shows superior classification accuracy, yet at the same time maintains the simplicity and efficiency.},
articlenumber = {105361},
doi = {10.1016/j.knosys.2019.105361},
keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Naive Bayes is one of the most popular data mining algorithms. Its efficiency comes from the assumption of attribute independence, although this might be violated in many real-world data sets. Many efforts have been done to mitigate the assumption, among which attribute selection is an important approach. However, conventional efforts to perform attribute selection in naive Bayes suffer from heavy computational overhead. This paper proposes an efficient selective naive Bayes algorithm, which adopts only some of the attributes to construct selective naive Bayes models. These models are built in such a way that each one is a trivial extension of another. The most predictive selective naive Bayes model can be selected by the measures of incremental leave-one-out cross validation. As a result, attributes can be selected by efficient model selection. Empirical results demonstrate that the selective naive Bayes shows superior classification accuracy, yet at the same time maintains the simplicity and efficiency.

Chen, Z., Zhao, P., Li, F., Marquez-Lago, T. T., Leier, A., Revote, J., Zhu, Y., Powell, D. R., Akutsu, T., Webb, G. I., Chou, K., Smith, I. A., Daly, R. J., Li, J., & Song, J.
Briefings in Bioinformatics, 21(3), 1047-1057, 2020.
Clarivate Web of Science Highly Cited Paper 2020, 2021, 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbz041,
author = {Chen, Zhen and Zhao, Pei and Li, Fuyi and Marquez-Lago, Tatiana T and Leier, Andre and Revote, Jerico and Zhu, Yan and Powell, David R and Akutsu, Tatsuya and Webb, Geoffrey I and Chou, Kuo-Chen and Smith, A Ian and Daly, Roger J and Li, Jian and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {{iLearn}: an integrated platform and meta-learner for feature engineering, machine-learning analysis and modeling of {DNA}, {RNA} and protein sequence data},
year = {2020},
issn = {1477-4054},
number = {3},
pages = {1047--1057},
volume = {21},
abstract = {With the explosive growth of biological sequences generated in the post-genomic era, one of the most challenging problems in bioinformatics and computational biology is to computationally characterize sequences, structures and functions in an efficient, accurate and high-throughput manner. A number of online web servers and stand-alone tools have been developed to address this to date; however, all these tools have their limitations and drawbacks in terms of their effectiveness, user-friendliness and capacity. Here, we present iLearn, a comprehensive and versatile Python-based toolkit, integrating the functionality of feature extraction, clustering, normalization, selection, dimensionality reduction, predictor construction, best descriptor/model selection, ensemble learning and results visualization for DNA, RNA and protein sequences. iLearn was designed for users that only want to upload their data set and select the functions they need calculated from it, while all necessary procedures and optimal settings are completed automatically by the software. iLearn includes a variety of descriptors for DNA, RNA and proteins, and four feature output formats are supported so as to facilitate direct output usage or communication with other computational tools. In total, iLearn encompasses 16 different types of feature clustering, selection, normalization and dimensionality reduction algorithms, and five commonly used machine-learning algorithms, thereby greatly facilitating feature analysis and predictor construction. iLearn is made freely available via an online web server and a stand-alone toolkit.},
comment = {Clarivate Web of Science Highly Cited Paper 2020, 2021, 2022},
doi = {10.1093/bib/bbz041},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT With the explosive growth of biological sequences generated in the post-genomic era, one of the most challenging problems in bioinformatics and computational biology is to computationally characterize sequences, structures and functions in an efficient, accurate and high-throughput manner. A number of online web servers and stand-alone tools have been developed to address this to date; however, all these tools have their limitations and drawbacks in terms of their effectiveness, user-friendliness and capacity. Here, we present iLearn, a comprehensive and versatile Python-based toolkit, integrating the functionality of feature extraction, clustering, normalization, selection, dimensionality reduction, predictor construction, best descriptor/model selection, ensemble learning and results visualization for DNA, RNA and protein sequences. iLearn was designed for users that only want to upload their data set and select the functions they need calculated from it, while all necessary procedures and optimal settings are completed automatically by the software. iLearn includes a variety of descriptors for DNA, RNA and proteins, and four feature output formats are supported so as to facilitate direct output usage or communication with other computational tools. In total, iLearn encompasses 16 different types of feature clustering, selection, normalization and dimensionality reduction algorithms, and five commonly used machine-learning algorithms, thereby greatly facilitating feature analysis and predictor construction. iLearn is made freely available via an online web server and a stand-alone toolkit.

Chen, Z., Zhao, P., Li, F., Wang, Y., Smith, I. A., Webb, G. I., Akutsu, T., Baggag, A., Bensmail, H., & Song, J.
Briefings in Bioinformatics, 21(5), 1676-1696, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbz112,
author = {Chen, Zhen and Zhao, Pei and Li, Fuyi and Wang, Yanan and Smith, A Ian and Webb, Geoffrey I and Akutsu, Tatsuya and Baggag, Abdelkader and Bensmail, Halima and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {Comprehensive review and assessment of computational methods for predicting {RNA} post-transcriptional modification sites from {RNA} sequences},
year = {2020},
issn = {1477-4054},
number = {5},
pages = {1676--1696},
volume = {21},
abstract = {RNA post-transcriptional modifications play a crucial role in a myriad of biological processes and cellular functions. To date, more than 160 RNA modifications have been discovered; therefore, accurate identification of RNA-modification sites is fundamental for a better understanding of RNA-mediated biological functions and mechanisms. However, due to limitations in experimental methods, systematic identification of different types of RNA-modification sites remains a major challenge. Recently, more than 20 computational methods have been developed to identify RNA-modification sites in tandem with high-throughput experimental methods, with most of these capable of predicting only single types of RNA-modification sites. These methods show high diversity in their dataset size, data quality, core algorithms, features extracted and feature selection techniques and evaluation strategies. Therefore, there is an urgent need to revisit these methods and summarize their methodologies, in order to improve and further develop computational techniques to identify and characterize RNA-modification sites from the large amounts of sequence data. With this goal in mind, first, we provide a comprehensive survey on a large collection of 27 state-of-the-art approaches for predicting N1-methyladenosine and N6-methyladenosine sites. We cover a variety of important aspects that are crucial for the development of successful predictors, including the dataset quality, operating algorithms, sequence and genomic features, feature selection, model performance evaluation and software utility. In addition, we also provide our thoughts on potential strategies to improve the model performance. Second, we propose a computational approach called DeepPromise based on deep learning techniques for simultaneous prediction of N1-methyladenosine and N6-methyladenosine. 
To extract the sequence context surrounding the modification sites, three feature encodings, including enhanced nucleic acid composition, one-hot encoding, and RNA embedding, were used as the input to seven consecutive layers of convolutional neural networks (CNNs), respectively. Moreover, DeepPromise further combined the prediction score of the CNN-based models and achieved around 43\% higher area under receiver-operating curve (AUROC) for m1A site prediction and 6\% higher AUROC for m6A site prediction, respectively, when compared with several existing state-of-the-art approaches on the independent test. In-depth analyses of characteristic sequence motifs identified from the convolution-layer filters indicated that nucleotide presentation at proximal positions surrounding the modification sites contributed most to the classification, whereas those at distal positions also affected classification but to different extents. To maximize user convenience, a web server was developed as an implementation of DeepPromise and made publicly available at http://DeepPromise.erc.monash.edu/, with the server accepting both RNA sequences and genomic sequences to allow prediction of two types of putative RNA-modification sites.},
doi = {10.1093/bib/bbz112},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT RNA post-transcriptional modifications play a crucial role in a myriad of biological processes and cellular functions. To date, more than 160 RNA modifications have been discovered; therefore, accurate identification of RNA-modification sites is fundamental for a better understanding of RNA-mediated biological functions and mechanisms. However, due to limitations in experimental methods, systematic identification of different types of RNA-modification sites remains a major challenge. Recently, more than 20 computational methods have been developed to identify RNA-modification sites in tandem with high-throughput experimental methods, with most of these capable of predicting only single types of RNA-modification sites. These methods show high diversity in their dataset size, data quality, core algorithms, features extracted and feature selection techniques and evaluation strategies. Therefore, there is an urgent need to revisit these methods and summarize their methodologies, in order to improve and further develop computational techniques to identify and characterize RNA-modification sites from the large amounts of sequence data. With this goal in mind, first, we provide a comprehensive survey on a large collection of 27 state-of-the-art approaches for predicting N1-methyladenosine and N6-methyladenosine sites. We cover a variety of important aspects that are crucial for the development of successful predictors, including the dataset quality, operating algorithms, sequence and genomic features, feature selection, model performance evaluation and software utility. In addition, we also provide our thoughts on potential strategies to improve the model performance. Second, we propose a computational approach called DeepPromise based on deep learning techniques for simultaneous prediction of N1-methyladenosine and N6-methyladenosine. 
To extract the sequence context surrounding the modification sites, three feature encodings, including enhanced nucleic acid composition, one-hot encoding, and RNA embedding, were used as the input to seven consecutive layers of convolutional neural networks (CNNs), respectively. Moreover, DeepPromise further combined the prediction score of the CNN-based models and achieved around 43% higher area under receiver-operating curve (AUROC) for m1A site prediction and 6% higher AUROC for m6A site prediction, respectively, when compared with several existing state-of-the-art approaches on the independent test. In-depth analyses of characteristic sequence motifs identified from the convolution-layer filters indicated that nucleotide presentation at proximal positions surrounding the modification sites contributed most to the classification, whereas those at distal positions also affected classification but to different extents. To maximize user convenience, a web server was developed as an implementation of DeepPromise and made publicly available at http://DeepPromise.erc.monash.edu/, with the server accepting both RNA sequences and genomic sequences to allow prediction of two types of putative RNA-modification sites.

Shifaz, A., Pelletier, C., Petitjean, F., & Webb, G. I.
Data Mining and Knowledge Discovery, 34(3), 742-775, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{shifazetal2019,
author = {Shifaz, Ahmed and Pelletier, Charlotte and Petitjean, Fran{\c{c}}ois and Webb, Geoffrey I},
journal = {Data Mining and Knowledge Discovery},
title = {{TS-CHIEF}: A Scalable and Accurate Forest Algorithm for Time Series Classification},
year = {2020},
number = {3},
pages = {742--775},
volume = {34},
abstract = {Time Series Classification (TSC) has seen enormous progress over the last two decades. HIVE-COTE (Hierarchical Vote Collective of Transformation-based Ensembles) is the current state of the art in terms of classification accuracy. HIVE-COTE recognizes that time series data are a specific data type for which the traditional attribute-value representation, used predominantly in machine learning, fails to provide a relevant representation. HIVE-COTE combines multiple types of classifiers: each extracting information about a specific aspect of a time series, be it in the time domain, frequency domain or summarization of intervals within the series. However, HIVE-COTE (and its predecessor, FLAT-COTE) is often infeasible to run on even modest amounts of data. For instance, training HIVE-COTE on a dataset with only 1500 time series can require 8 days of CPU time. It has polynomial runtime with respect to the training set size, so this problem compounds as data quantity increases. We propose a novel TSC algorithm, TS-CHIEF (Time Series Combination of Heterogeneous and Integrated Embedding Forest), which rivals HIVE-COTE in accuracy but requires only a fraction of the runtime. TS-CHIEF constructs an ensemble classifier that integrates the most effective embeddings of time series that research has developed in the last decade. It uses tree-structured classifiers to do so efficiently. We assess TS-CHIEF on 85 datasets of the University of California Riverside (UCR) archive, where it achieves state-of-the-art accuracy with scalability and efficiency. We demonstrate that TS-CHIEF can be trained on 130 k time series in 2 days, a data quantity that is beyond the reach of any TSC algorithm with comparable accuracy.},
doi = {10.1007/s10618-020-00679-8},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/c1zg6},
}
ABSTRACT Time Series Classification (TSC) has seen enormous progress over the last two decades. HIVE-COTE (Hierarchical Vote Collective of Transformation-based Ensembles) is the current state of the art in terms of classification accuracy. HIVE-COTE recognizes that time series data are a specific data type for which the traditional attribute-value representation, used predominantly in machine learning, fails to provide a relevant representation. HIVE-COTE combines multiple types of classifiers: each extracting information about a specific aspect of a time series, be it in the time domain, frequency domain or summarization of intervals within the series. However, HIVE-COTE (and its predecessor, FLAT-COTE) is often infeasible to run on even modest amounts of data. For instance, training HIVE-COTE on a dataset with only 1500 time series can require 8 days of CPU time. It has polynomial runtime with respect to the training set size, so this problem compounds as data quantity increases. We propose a novel TSC algorithm, TS-CHIEF (Time Series Combination of Heterogeneous and Integrated Embedding Forest), which rivals HIVE-COTE in accuracy but requires only a fraction of the runtime. TS-CHIEF constructs an ensemble classifier that integrates the most effective embeddings of time series that research has developed in the last decade. It uses tree-structured classifiers to do so efficiently. We assess TS-CHIEF on 85 datasets of the University of California Riverside (UCR) archive, where it achieves state-of-the-art accuracy with scalability and efficiency. We demonstrate that TS-CHIEF can be trained on 130 k time series in 2 days, a data quantity that is beyond the reach of any TSC algorithm with comparable accuracy.

Fawaz, H. I., Lucas, B., Forestier, G., Pelletier, C., Schmidt, D. F., Weber, J., Webb, G. I., Idoumghar, L., Muller, P., & Petitjean, F.
Data Mining and Knowledge Discovery, 34, 1936-1962, 2020.
Most Highly Cited Paper Published In Data Mining and Knowledge Discovery in 2020; Clarivate Web of Science Highly Cited Paper 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{fawaz2019inceptiontime,
author = {Fawaz, Hassan Ismail and Lucas, Benjamin and Forestier, Germain and Pelletier, Charlotte and Schmidt, Daniel F. and Weber, Jonathan and Webb, Geoffrey I. and Idoumghar, Lhassane and Muller, Pierre-Alain and Petitjean, Francois},
journal = {Data Mining and Knowledge Discovery},
title = {InceptionTime: Finding AlexNet for Time Series Classification},
year = {2020},
number = {6},
pages = {1936--1962},
volume = {34},
abstract = {This paper brings deep learning at the forefront of research into time series classification (TSC). TSC is the area of machine learning tasked with the categorization (or labelling) of time series. The last few decades of work in this area have led to significant progress in the accuracy of classifiers, with the state of the art now represented by the HIVE-COTE algorithm. While extremely accurate, HIVE-COTE cannot be applied to many real-world datasets because of its high training time complexity in O(N^2 . T^4) for a dataset with N time series of length T. For example, it takes HIVE-COTE more than 8 days to learn from a small dataset with N = 1500 time series of short length T = 46. Meanwhile deep learning has received enormous attention because of its high accuracy and scalability. Recent approaches to deep learning for TSC have been scalable, but less accurate than HIVE-COTE. We introduce InceptionTime - an ensemble of deep Convolutional Neural Network models, inspired by the Inception-v4 architecture. Our experiments show that InceptionTime is on par with HIVE-COTE in terms of accuracy while being much more scalable: not only can it learn from 1500 time series in one hour but it can also learn from 8M time series in 13 h, a quantity of data that is fully out of reach of HIVE-COTE.},
comment = {Most Highly Cited Paper Published In Data Mining and Knowledge Discovery in 2020; Clarivate Web of Science Highly Cited Paper 2022},
doi = {10.1007/s10618-020-00710-y},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/b6TXh},
}
ABSTRACT This paper brings deep learning at the forefront of research into time series classification (TSC). TSC is the area of machine learning tasked with the categorization (or labelling) of time series. The last few decades of work in this area have led to significant progress in the accuracy of classifiers, with the state of the art now represented by the HIVE-COTE algorithm. While extremely accurate, HIVE-COTE cannot be applied to many real-world datasets because of its high training time complexity in O(N^2 . T^4) for a dataset with N time series of length T. For example, it takes HIVE-COTE more than 8 days to learn from a small dataset with N = 1500 time series of short length T = 46. Meanwhile deep learning has received enormous attention because of its high accuracy and scalability. Recent approaches to deep learning for TSC have been scalable, but less accurate than HIVE-COTE. We introduce InceptionTime - an ensemble of deep Convolutional Neural Network models, inspired by the Inception-v4 architecture. Our experiments show that InceptionTime is on par with HIVE-COTE in terms of accuracy while being much more scalable: not only can it learn from 1500 time series in one hour but it can also learn from 8M time series in 13 h, a quantity of data that is fully out of reach of HIVE-COTE.

Goldenberg, I., & Webb, G. I.
Knowledge and Information Systems, 62, 2835-2854, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Goldenberg2020,
author = {Goldenberg, Igor and Webb, Geoffrey I.},
journal = {Knowledge and Information Systems},
title = {PCA-based drift and shift quantification framework for multidimensional data},
year = {2020},
pages = {2835--2854},
volume = {62},
abstract = {Concept drift is a serious problem confronting machine learning systems in a dynamic and ever-changing world. In order to manage concept drift it may be useful to first quantify it by measuring the distance between distributions that generate data before and after a drift. There is a paucity of methods to do so in the case of multidimensional numeric data. This paper provides an in-depth analysis of the PCA-based change detection approach, identifies shortcomings of existing methods and shows how this approach can be used to measure a drift, not merely detect it.},
doi = {10.1007/s10115-020-01438-3},
keywords = {Concept Drift},
related = {learning-from-non-stationary-distributions},
}
ABSTRACT Concept drift is a serious problem confronting machine learning systems in a dynamic and ever-changing world. In order to manage concept drift it may be useful to first quantify it by measuring the distance between distributions that generate data before and after a drift. There is a paucity of methods to do so in the case of multidimensional numeric data. This paper provides an in-depth analysis of the PCA-based change detection approach, identifies shortcomings of existing methods and shows how this approach can be used to measure a drift, not merely detect it.

Li, F., Chen, J., Leier, A., Marquez-Lago, T., Liu, Q., Wang, Y., Revote, J., Smith, I. A., Akutsu, T., Webb, G. I., Kurgan, L., & Song, J.
Bioinformatics, 36(4), 1057-1065, 2020.
Clarivate Web of Science Highly Cited Paper 2020
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Li2020a,
author = {Li, Fuyi and Chen, Jinxiang and Leier, Andre and Marquez-Lago, Tatiana and Liu, Quanzhong and Wang, Yanze and Revote, Jerico and Smith, A Ian and Akutsu, Tatsuya and Webb, Geoffrey I and Kurgan, Lukasz and Song, Jiangning},
journal = {Bioinformatics},
title = {DeepCleave: a deep learning predictor for caspase and matrix metalloprotease substrates and cleavage sites},
year = {2020},
issn = {1367-4803},
number = {4},
pages = {1057--1065},
volume = {36},
abstract = {Proteases are enzymes that cleave target substrate proteins by catalyzing the hydrolysis of peptide bonds between specific amino acids. While the functional proteolysis regulated by proteases plays a central role in the "life and death" process of proteins, many of the corresponding substrates and their cleavage sites were not found yet. Availability of accurate predictors of the substrates and cleavage sites would facilitate understanding of proteases’ functions and physiological roles. Deep learning is a promising approach for the development of accurate predictors of substrate cleavage events. We propose DeepCleave, the first deep learning-based predictor of protease-specific substrates and cleavage sites. DeepCleave uses protein substrate sequence data as input and employs convolutional neural networks with transfer learning to train accurate predictive models. High predictive performance of our models stems from the use of high-quality cleavage site features extracted from the substrate sequences through the deep learning process, and the application of transfer learning, multiple kernels and attention layer in the design of the deep network. Empirical tests against several related state-of-the-art methods demonstrate that DeepCleave outperforms these methods in predicting caspase and matrix metalloprotease substrate-cleavage sites. The DeepCleave webserver and source code are freely available at http://deepcleave.erc.monash.edu/. Supplementary data are available at Bioinformatics online.},
comment = {Clarivate Web of Science Highly Cited Paper 2020},
doi = {10.1093/bioinformatics/btz721},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Proteases are enzymes that cleave target substrate proteins by catalyzing the hydrolysis of peptide bonds between specific amino acids. While the functional proteolysis regulated by proteases plays a central role in the "life and death" process of proteins, many of the corresponding substrates and their cleavage sites were not found yet. Availability of accurate predictors of the substrates and cleavage sites would facilitate understanding of proteases’ functions and physiological roles. Deep learning is a promising approach for the development of accurate predictors of substrate cleavage events. We propose DeepCleave, the first deep learning-based predictor of protease-specific substrates and cleavage sites. DeepCleave uses protein substrate sequence data as input and employs convolutional neural networks with transfer learning to train accurate predictive models. High predictive performance of our models stems from the use of high-quality cleavage site features extracted from the substrate sequences through the deep learning process, and the application of transfer learning, multiple kernels and attention layer in the design of the deep network. Empirical tests against several related state-of-the-art methods demonstrate that DeepCleave outperforms these methods in predicting caspase and matrix metalloprotease substrate-cleavage sites. The DeepCleave webserver and source code are freely available at http://deepcleave.erc.monash.edu/. Supplementary data are available at Bioinformatics online.

Li, F., Fan, C., Marquez-Lago, T. T., Leier, A., Revote, J., Jia, C., Zhu, Y., Smith, I. A., Webb, G. I., Liu, Q., Wei, L., Li, J., & Song, J.
Briefings in Bioinformatics, 21(3), 1069-1079, 2020.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{10.1093/bib/bbz050,
author = {Li, Fuyi and Fan, Cunshuo and Marquez-Lago, Tatiana T and Leier, Andre and Revote, Jerico and Jia, Cangzhi and Zhu, Yan and Smith, A Ian and Webb, Geoffrey I and Liu, Quanzhong and Wei, Leyi and Li, Jian and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {PRISMOID: a comprehensive 3D structure database for post-translational modifications and mutations with functional impact},
year = {2020},
issn = {1477-4054},
number = {3},
pages = {1069--1079},
volume = {21},
abstract = {Post-translational modifications (PTMs) play very important roles in various cell signaling pathways and biological process. Due to PTMs' extremely important roles, many major PTMs have been studied, while the functional and mechanical characterization of major PTMs is well documented in several databases. However, most currently available databases mainly focus on protein sequences, while the real 3D structures of PTMs have been largely ignored. Therefore, studies of PTMs 3D structural signatures have been severely limited by the deficiency of the data. Here, we develop PRISMOID, a novel publicly available and free 3D structure database for a wide range of PTMs. PRISMOID represents an up-to-date and interactive online knowledge base with specific focus on 3D structural contexts of PTMs sites and mutations that occur on PTMs and in the close proximity of PTM sites with functional impact. The first version of PRISMOID encompasses 17 145 non-redundant modification sites on 3919 related protein 3D structure entries pertaining to 37 different types of PTMs. Our entry web page is organized in a comprehensive manner, including detailed PTM annotation on the 3D structure and biological information in terms of mutations affecting PTMs, secondary structure features and per-residue solvent accessibility features of PTM sites, domain context, predicted natively disordered regions and sequence alignments. In addition, high-definition JavaScript packages are employed to enhance information visualization in PRISMOID. PRISMOID equips a variety of interactive and customizable search options and data browsing functions; these capabilities allow users to access data via keyword, ID and advanced options combination search in an efficient and user-friendly way. A download page is also provided to enable users to download the SQL file, computational structural features and PTM sites’ data.
We anticipate PRISMOID will swiftly become an invaluable online resource, assisting both biologists and bioinformaticians to conduct experiments and develop applications supporting discovery efforts in the sequence–structural–functional relationship of PTMs and providing important insight into mutations and PTM sites interaction mechanisms. The PRISMOID database is freely accessible at http://prismoid.erc.monash.edu/. The database and web interface are implemented in MySQL, JSP, JavaScript and HTML with all major browsers supported.},
doi = {10.1093/bib/bbz050},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Post-translational modifications (PTMs) play very important roles in various cell signaling pathways and biological process. Due to PTMs' extremely important roles, many major PTMs have been studied, while the functional and mechanical characterization of major PTMs is well documented in several databases. However, most currently available databases mainly focus on protein sequences, while the real 3D structures of PTMs have been largely ignored. Therefore, studies of PTMs 3D structural signatures have been severely limited by the deficiency of the data. Here, we develop PRISMOID, a novel publicly available and free 3D structure database for a wide range of PTMs. PRISMOID represents an up-to-date and interactive online knowledge base with specific focus on 3D structural contexts of PTMs sites and mutations that occur on PTMs and in the close proximity of PTM sites with functional impact. The first version of PRISMOID encompasses 17 145 non-redundant modification sites on 3919 related protein 3D structure entries pertaining to 37 different types of PTMs. Our entry web page is organized in a comprehensive manner, including detailed PTM annotation on the 3D structure and biological information in terms of mutations affecting PTMs, secondary structure features and per-residue solvent accessibility features of PTM sites, domain context, predicted natively disordered regions and sequence alignments. In addition, high-definition JavaScript packages are employed to enhance information visualization in PRISMOID. PRISMOID equips a variety of interactive and customizable search options and data browsing functions; these capabilities allow users to access data via keyword, ID and advanced options combination search in an efficient and user-friendly way. A download page is also provided to enable users to download the SQL file, computational structural features and PTM sites’ data.
We anticipate PRISMOID will swiftly become an invaluable online resource, assisting both biologists and bioinformaticians to conduct experiments and develop applications supporting discovery efforts in the sequence–structural–functional relationship of PTMs and providing important insight into mutations and PTM sites interaction mechanisms. The PRISMOID database is freely accessible at http://prismoid.erc.monash.edu/. The database and web interface are implemented in MySQL, JSP, JavaScript and HTML with all major browsers supported.

Wang, X., Li, C., Li, F., Sharma, V. S., Song, J., & Webb, G. I.
BMC Bioinformatics, 20(1), Art. no. 602, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Wang2019,
author = {Wang, Xiaochuan and Li, Chen and Li, Fuyi and Sharma, Varun S. and Song, Jiangning and Webb, Geoffrey I.},
journal = {BMC Bioinformatics},
title = {SIMLIN: a bioinformatics tool for prediction of S-sulphenylation in the human proteome based on multi-stage ensemble-learning models},
year = {2019},
month = nov,
number = {1},
volume = {20},
abstract = {S-sulphenylation is a ubiquitous protein post-translational modification (PTM) where an S-hydroxyl (−SOH) bond is formed via the reversible oxidation on the Sulfhydryl group of cysteine (C). Recent experimental studies have revealed that S-sulphenylation plays critical roles in many biological functions, such as protein regulation and cell signaling. State-of-the-art bioinformatic advances have facilitated high-throughput in silico screening of protein S-sulphenylation sites, thereby significantly reducing the time and labour costs traditionally required for the experimental investigation of S-sulphenylation.},
articlenumber = {602},
doi = {10.1186/s12859-019-3178-6},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT S-sulphenylation is a ubiquitous protein post-translational modification (PTM) where an S-hydroxyl (−SOH) bond is formed via the reversible oxidation on the Sulfhydryl group of cysteine (C). Recent experimental studies have revealed that S-sulphenylation plays critical roles in many biological functions, such as protein regulation and cell signaling. State-of-the-art bioinformatic advances have facilitated high-throughput in silico screening of protein S-sulphenylation sites, thereby significantly reducing the time and labour costs traditionally required for the experimental investigation of S-sulphenylation.

Pelletier, C., Webb, G. I., & Petitjean, F.
IEEE International Geoscience And Remote Sensing Symposium, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{PelletierEtAl19b,
author = {Pelletier, Charlotte and Webb, Geoffrey I. and Petitjean, Francois},
booktitle = {IEEE International Geoscience And Remote Sensing Symposium},
title = {Deep Learning for the Classification of Sentinel-2 Image Series},
year = {2019},
month = jul,
abstract = {Satellite image time series (SITS) have proven to be essential for accurate and up-to-date land cover mapping over large areas. Most works about SITS have focused on the use of traditional classification algorithms such as Random Forests (RFs). Deep learning algorithms have been very successful for supervised tasks, in particular for data that exhibit a structure between attributes, such as space or time. In this work, we compare for the first time RFs to the two leading deep learning algorithms for handling temporal data: Recurrent Neural Networks (RNNs) and temporal Convolutional Neural Networks (TempCNNs). We carry out a large experiment using Sentinel-2 time series. We compare both accuracy and computational times to classify 10,980 km² over Australia. The results highlight the good performance of TempCNNs that obtain the highest accuracy. They also show that RNNs might be less suited for large scale study as they have higher runtime complexity.},
keywords = {time series, earth observation analytics},
related = {earth-observation-analytics},
}
ABSTRACT Satellite image time series (SITS) have proven to be essential for accurate and up-to-date land cover mapping over large areas. Most works about SITS have focused on the use of traditional classification algorithms such as Random Forests (RFs). Deep learning algorithms have been very successful for supervised tasks, in particular for data that exhibit a structure between attributes, such as space or time. In this work, we compare for the first time RFs to the two leading deep learning algorithms for handling temporal data: Recurrent Neural Networks (RNNs) and temporal Convolutional Neural Networks (TempCNNs). We carry out a large experiment using Sentinel-2 time series. We compare both accuracy and computational times to classify 10,980 km² over Australia. The results highlight the good performance of TempCNNs that obtain the highest accuracy. They also show that RNNs might be less suited for large scale study as they have higher runtime complexity.

Hamalainen, W., & Webb, G. I.
Data Mining and Knowledge Discovery, 33(2), 325-377, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{HamalainenWebb2019,
author = {Hamalainen, Wilhelmiina and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {A tutorial on statistically sound pattern discovery},
year = {2019},
issn = {1573-756X},
month = mar,
number = {2},
pages = {325--377},
volume = {33},
abstract = {Statistically sound pattern discovery harnesses the rigour of statistical hypothesis testing to overcome many of the issues that have hampered standard data mining approaches to pattern discovery. Most importantly, application of appropriate statistical tests allows precise control over the risk of false discoveries---patterns that are found in the sample data but do not hold in the wider population from which the sample was drawn. Statistical tests can also be applied to filter out patterns that are unlikely to be useful, removing uninformative variations of the key patterns in the data. This tutorial introduces the key statistical and data mining theory and techniques that underpin this fast developing field. We concentrate on two general classes of patterns: dependency rules that express statistical dependencies between condition and consequent parts and dependency sets that express mutual dependence between set elements. We clarify alternative interpretations of statistical dependence and introduce appropriate tests for evaluating statistical significance of patterns in different situations. We also introduce special techniques for controlling the likelihood of spurious discoveries when multitudes of patterns are evaluated. The paper is aimed at a wide variety of audiences. It provides the necessary statistical background and summary of the state-of-the-art for any data mining researcher or practitioner wishing to enter or understand statistically sound pattern discovery research or practice. It can serve as a general introduction to the field of statistically sound pattern discovery for any reader with a general background in data sciences.},
doi = {10.1007/s10618-018-0590-x},
keywords = {Association Rule Discovery and statistically sound discovery},
related = {statistically-sound-association-discovery},
url = {https://rdcu.be/bd2MI},
}
ABSTRACT Statistically sound pattern discovery harnesses the rigour of statistical hypothesis testing to overcome many of the issues that have hampered standard data mining approaches to pattern discovery. Most importantly, application of appropriate statistical tests allows precise control over the risk of false discoveries—patterns that are found in the sample data but do not hold in the wider population from which the sample was drawn. Statistical tests can also be applied to filter out patterns that are unlikely to be useful, removing uninformative variations of the key patterns in the data. This tutorial introduces the key statistical and data mining theory and techniques that underpin this fast developing field. We concentrate on two general classes of patterns: dependency rules that express statistical dependencies between condition and consequent parts and dependency sets that express mutual dependence between set elements. We clarify alternative interpretations of statistical dependence and introduce appropriate tests for evaluating statistical significance of patterns in different situations. We also introduce special techniques for controlling the likelihood of spurious discoveries when multitudes of patterns are evaluated. The paper is aimed at a wide variety of audiences. It provides the necessary statistical background and summary of the state-of-the-art for any data mining researcher or practitioner wishing to enter or understand statistically sound pattern discovery research or practice. It can serve as a general introduction to the field of statistically sound pattern discovery for any reader with a general background in data sciences.

Li, F., Zhang, Y., Purcell, A. W., Webb, G. I., Chou, K., Lithgow, T., Li, C., & Song, J.
BMC Bioinformatics, 20(1), 112, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Li2019,
author = {Li, Fuyi and Zhang, Yang and Purcell, Anthony W. and Webb, Geoffrey I. and Chou, Kuo-Chen and Lithgow, Trevor and Li, Chen and Song, Jiangning},
journal = {BMC Bioinformatics},
title = {Positive-unlabelled learning of glycosylation sites in the human proteome},
year = {2019},
issn = {1471-2105},
month = mar,
number = {1},
pages = {112},
volume = {20},
abstract = {As an important type of post-translational modification (PTM), protein glycosylation plays a crucial role in protein stability and protein function. The abundance and ubiquity of protein glycosylation across three domains of life involving Eukarya, Bacteria and Archaea demonstrate its roles in regulating a variety of signalling and metabolic pathways. Mutations on and in the proximity of glycosylation sites are highly associated with human diseases. Accordingly, accurate prediction of glycosylation can complement laboratory-based methods and greatly benefit experimental efforts for characterization and understanding of functional roles of glycosylation. For this purpose, a number of supervised-learning approaches have been proposed to identify glycosylation sites, demonstrating a promising predictive performance. To train a conventional supervised-learning model, both reliable positive and negative samples are required. However, in practice, a large portion of negative samples (i.e. non-glycosylation sites) are mislabelled due to the limitation of current experimental technologies. Moreover, supervised algorithms often fail to take advantage of large volumes of unlabelled data, which can aid in model learning in conjunction with positive samples (i.e. experimentally verified glycosylation sites).},
doi = {10.1186/s12859-019-2700-1},
keywords = {Bioinformatics},
related = {computational-biology},
url = {https://rdcu.be/bpQBV},
}
ABSTRACT As an important type of post-translational modification (PTM), protein glycosylation plays a crucial role in protein stability and protein function. The abundance and ubiquity of protein glycosylation across three domains of life involving Eukarya, Bacteria and Archaea demonstrate its roles in regulating a variety of signalling and metabolic pathways. Mutations on and in the proximity of glycosylation sites are highly associated with human diseases. Accordingly, accurate prediction of glycosylation can complement laboratory-based methods and greatly benefit experimental efforts for characterization and understanding of functional roles of glycosylation. For this purpose, a number of supervised-learning approaches have been proposed to identify glycosylation sites, demonstrating a promising predictive performance. To train a conventional supervised-learning model, both reliable positive and negative samples are required. However, in practice, a large portion of negative samples (i.e. non-glycosylation sites) are mislabelled due to the limitation of current experimental technologies. Moreover, supervised algorithms often fail to take advantage of large volumes of unlabelled data, which can aid in model learning in conjunction with positive samples (i.e. experimentally verified glycosylation sites).

Lucas, B., Shifaz, A., Pelletier, C., O'Neill, L., Zaidi, N., Goethals, B., Petitjean, F., & Webb, G. I.
Data Mining and Knowledge Discovery, 33, 607-635, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{LucasEtAl2019,
author = {Lucas, Benjamin and Shifaz, Ahmed and Pelletier, Charlotte and O'Neill, Lachlan and Zaidi, Nayyar and Goethals, Bart and Petitjean, Francois and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {Proximity Forest: an effective and scalable distance-based classifier for time series},
year = {2019},
issn = {1573-756X},
pages = {607--635},
volume = {33},
abstract = {Research into the classification of time series has made enormous progress in the last decade. The UCR time series archive has played a significant role in challenging and guiding the development of new learners for time series classification. The largest dataset in the UCR archive holds 10,000 time series only; which may explain why the primary research focus has been on creating algorithms that have high accuracy on relatively small datasets. This paper introduces Proximity Forest, an algorithm that learns accurate models from datasets with millions of time series, and classifies a time series in milliseconds. The models are ensembles of highly randomized Proximity Trees. Whereas conventional decision trees branch on attribute values (and usually perform poorly on time series), Proximity Trees branch on the proximity of time series to one exemplar time series or another; allowing us to leverage the decades of work into developing relevant measures for time series. Proximity Forest gains both efficiency and accuracy by stochastic selection of both exemplars and similarity measures. Our work is motivated by recent time series applications that provide orders of magnitude more time series than the UCR benchmarks. Our experiments demonstrate that Proximity Forest is highly competitive on the UCR archive: it ranks among the most accurate classifiers while being significantly faster. We demonstrate on a 1M time series Earth observation dataset that Proximity Forest retains this accuracy on datasets that are many orders of magnitude greater than those in the UCR repository, while learning its models at least 100,000 times faster than current state-of-the-art models Elastic Ensemble and COTE.},
doi = {10.1007/s10618-019-00617-3},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://rdcu.be/blB8E},
}
ABSTRACT Research into the classification of time series has made enormous progress in the last decade. The UCR time series archive has played a significant role in challenging and guiding the development of new learners for time series classification. The largest dataset in the UCR archive holds 10,000 time series only; which may explain why the primary research focus has been on creating algorithms that have high accuracy on relatively small datasets. This paper introduces Proximity Forest, an algorithm that learns accurate models from datasets with millions of time series, and classifies a time series in milliseconds. The models are ensembles of highly randomized Proximity Trees. Whereas conventional decision trees branch on attribute values (and usually perform poorly on time series), Proximity Trees branch on the proximity of time series to one exemplar time series or another; allowing us to leverage the decades of work into developing relevant measures for time series. Proximity Forest gains both efficiency and accuracy by stochastic selection of both exemplars and similarity measures. Our work is motivated by recent time series applications that provide orders of magnitude more time series than the UCR benchmarks. Our experiments demonstrate that Proximity Forest is highly competitive on the UCR archive: it ranks among the most accurate classifiers while being significantly faster. We demonstrate on a 1M time series Earth observation dataset that Proximity Forest retains this accuracy on datasets that are many orders of magnitude greater than those in the UCR repository, while learning its models at least 100,000 times faster than current state-of-the-art models Elastic Ensemble and COTE.

Zhang, Y., Xie, R., Wang, J., Leier, A., Marquez-Lago, T. T., Akutsu, T., Webb, G. I., Chou, K., & Song, J.
Briefings in Bioinformatics, 20(6), 2185-2199, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ZhangEtAl18,
author = {Zhang, Yanju and Xie, Ruopeng and Wang, Jiawei and Leier, Andre and Marquez-Lago, Tatiana T. and Akutsu, Tatsuya and Webb, Geoffrey I. and Chou, Kuo-Chen and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {Computational analysis and prediction of lysine malonylation sites by exploiting informative features in an integrative machine-learning framework},
year = {2019},
number = {6},
pages = {2185--2199},
volume = {20},
abstract = {As a newly discovered post-translational modification (PTM), lysine malonylation (Kmal) regulates a myriad of cellular processes from prokaryotes to eukaryotes and has important implications in human diseases. Despite its functional significance, computational methods to accurately identify malonylation sites are still lacking and urgently needed. In particular, there is currently no comprehensive analysis and assessment of different features and machine learning (ML) methods that are required for constructing the necessary prediction models. Here, we review, analyze and compare 11 different feature encoding methods, with the goal of extracting key patterns and characteristics from residue sequences of Kmal sites. We identify optimized feature sets, with which four commonly used ML methods (random forest, support vector machines, K-nearest neighbor and logistic regression) and one recently proposed [Light Gradient Boosting Machine (LightGBM)] are trained on data from three species, namely, Escherichia coli, Mus musculus and Homo sapiens, and compared using randomized 10-fold cross-validation tests. We show that integration of the single method-based models through ensemble learning further improves the prediction performance and model robustness on the independent test. When compared to the existing state-of-the-art predictor, MaloPred, the optimal ensemble models were more accurate for all three species (AUC: 0.930, 0.923 and 0.944 for E. coli, M. musculus and H. sapiens, respectively). Using the ensemble models, we developed an accessible online predictor, kmal-sp, available at http://kmalsp.erc.monash.edu/. 
We hope that this comprehensive survey and the proposed strategy for building more accurate models can serve as a useful guide for inspiring future developments of computational methods for PTM site prediction, expedite the discovery of new malonylation and other PTM types and facilitate hypothesis-driven experimental validation of novel malonylated substrates and malonylation sites.},
doi = {10.1093/bib/bby079},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT As a newly discovered post-translational modification (PTM), lysine malonylation (Kmal) regulates a myriad of cellular processes from prokaryotes to eukaryotes and has important implications in human diseases. Despite its functional significance, computational methods to accurately identify malonylation sites are still lacking and urgently needed. In particular, there is currently no comprehensive analysis and assessment of different features and machine learning (ML) methods that are required for constructing the necessary prediction models. Here, we review, analyze and compare 11 different feature encoding methods, with the goal of extracting key patterns and characteristics from residue sequences of Kmal sites. We identify optimized feature sets, with which four commonly used ML methods (random forest, support vector machines, K-nearest neighbor and logistic regression) and one recently proposed [Light Gradient Boosting Machine (LightGBM)] are trained on data from three species, namely, Escherichia coli, Mus musculus and Homo sapiens, and compared using randomized 10-fold cross-validation tests. We show that integration of the single method-based models through ensemble learning further improves the prediction performance and model robustness on the independent test. When compared to the existing state-of-the-art predictor, MaloPred, the optimal ensemble models were more accurate for all three species (AUC: 0.930, 0.923 and 0.944 for E. coli, M. musculus and H. sapiens, respectively). Using the ensemble models, we developed an accessible online predictor, kmal-sp, available at http://kmalsp.erc.monash.edu/. 
We hope that this comprehensive survey and the proposed strategy for building more accurate models can serve as a useful guide for inspiring future developments of computational methods for PTM site prediction, expedite the discovery of new malonylation and other PTM types and facilitate hypothesis-driven experimental validation of novel malonylated substrates and malonylation sites.

Goldenberg, I., & Webb, G. I.
Knowledge and Information Systems, 60(2), 591-615, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Goldenberg2018,
author = {Goldenberg, Igor and Webb, Geoffrey I.},
journal = {Knowledge and Information Systems},
title = {Survey of distance measures for quantifying concept drift and shift in numeric data},
year = {2019},
issn = {0219-3116},
number = {2},
pages = {591--615},
volume = {60},
abstract = {Deployed machine learning systems are necessarily learned from historical data and are often applied to current data. When the world changes, the learned models can lose fidelity. Such changes to the statistical properties of data over time are known as concept drift. Similarly, models are often learned in one context, but need to be applied in another. This is called concept shift. Quantifying the magnitude of drift or shift, especially in the context of covariate drift or shift, or unsupervised learning, requires use of measures of distance between distributions. In this paper, we survey such distance measures with respect to their suitability for estimating drift and shift magnitude between samples of numeric data.},
doi = {10.1007/s10115-018-1257-z},
keywords = {Concept Drift},
related = {learning-from-non-stationary-distributions},
}
ABSTRACT Deployed machine learning systems are necessarily learned from historical data and are often applied to current data. When the world changes, the learned models can lose fidelity. Such changes to the statistical properties of data over time are known as concept drift. Similarly, models are often learned in one context, but need to be applied in another. This is called concept shift. Quantifying the magnitude of drift or shift, especially in the context of covariate drift or shift, or unsupervised learning, requires use of measures of distance between distributions. In this paper, we survey such distance measures with respect to their suitability for estimating drift and shift magnitude between samples of numeric data.

Wang, J., Yang, B., An, Y., Marquez-Lago, T., Leier, A., Wilksch, J., Hong, Q., Zhang, Y., Hayashida, M., Akutsu, T., Webb, G. I., Strugnell, R. A., Song, J., & Lithgow, T.
Briefings in Bioinformatics, 20(3), 931-951, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{doi:10.1093/bib/bbx164,
author = {Wang, Jiawei and Yang, Bingjiao and An, Yi and Marquez-Lago, Tatiana and Leier, Andre and Wilksch, Jonathan and Hong, Qingyang and Zhang, Yang and Hayashida, Morihiro and Akutsu, Tatsuya and Webb, Geoffrey I. and Strugnell, Richard A. and Song, Jiangning and Lithgow, Trevor},
journal = {Briefings in Bioinformatics},
title = {Systematic analysis and prediction of type {IV} secreted effector proteins by machine learning approaches},
year = {2019},
number = {3},
pages = {931--951},
volume = {20},
abstract = {In the course of infecting their hosts, pathogenic bacteria secrete numerous effectors, namely, bacterial proteins that pervert host cell biology. Many Gram-negative bacteria, including context-dependent human pathogens, use a type IV secretion system (T4SS) to translocate effectors directly into the cytosol of host cells. Various type IV secreted effectors (T4SEs) have been experimentally validated to play crucial roles in virulence by manipulating host cell gene expression and other processes. Consequently, the identification of novel effector proteins is an important step in increasing our understanding of host-pathogen interactions and bacterial pathogenesis. Here, we train and compare six machine learning models, namely, Naive Bayes (NB), K-nearest neighbor (KNN), logistic regression (LR), random forest (RF), support vector machines (SVMs) and multilayer perceptron (MLP), for the identification of T4SEs using 10 types of selected features and 5-fold cross-validation. Our study shows that: (1) including different but complementary features generally enhance the predictive performance of T4SEs; (2) ensemble models, obtained by integrating individual single-feature models, exhibit a significantly improved predictive performance and (3) the 'majority voting strategy' led to a more stable and accurate classification performance when applied to predicting an ensemble learning model with distinct single features. We further developed a new method to effectively predict T4SEs, Bastion4 (Bacterial secretion effector predictor for T4SS), and we show our ensemble classifier clearly outperforms two recent prediction tools. In summary, we developed a state-of-the-art T4SE predictor by conducting a comprehensive performance evaluation of different machine learning algorithms along with a detailed analysis of single- and multi-feature selections.},
doi = {10.1093/bib/bbx164},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT In the course of infecting their hosts, pathogenic bacteria secrete numerous effectors, namely, bacterial proteins that pervert host cell biology. Many Gram-negative bacteria, including context-dependent human pathogens, use a type IV secretion system (T4SS) to translocate effectors directly into the cytosol of host cells. Various type IV secreted effectors (T4SEs) have been experimentally validated to play crucial roles in virulence by manipulating host cell gene expression and other processes. Consequently, the identification of novel effector proteins is an important step in increasing our understanding of host-pathogen interactions and bacterial pathogenesis. Here, we train and compare six machine learning models, namely, Naive Bayes (NB), K-nearest neighbor (KNN), logistic regression (LR), random forest (RF), support vector machines (SVMs) and multilayer perceptron (MLP), for the identification of T4SEs using 10 types of selected features and 5-fold cross-validation. Our study shows that: (1) including different but complementary features generally enhance the predictive performance of T4SEs; (2) ensemble models, obtained by integrating individual single-feature models, exhibit a significantly improved predictive performance and (3) the 'majority voting strategy' led to a more stable and accurate classification performance when applied to predicting an ensemble learning model with distinct single features. We further developed a new method to effectively predict T4SEs, Bastion4 (Bacterial secretion effector predictor for T4SS), and we show our ensemble classifier clearly outperforms two recent prediction tools. In summary, we developed a state-of-the-art T4SE predictor by conducting a comprehensive performance evaluation of different machine learning algorithms along with a detailed analysis of single- and multi-feature selections.

Yu, H., & Webb, G. I.
Neurocomputing, 343, 141-153, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{yuetal2019,
author = {Yu, Hualong and Webb, Geoffrey I.},
journal = {Neurocomputing},
title = {Adaptive Online Extreme Learning Machine by Regulating Forgetting Factor by Concept Drift Map},
year = {2019},
pages = {141--153},
volume = {343},
abstract = {In online-learning, the data is incrementally received and the distributions from which it is drawn may keep changing over time. This phenomenon is widely known as concept drift. Such changes may affect the generalization of a learned model to future data. This problem may be exacerbated by the form of the drift itself changing over time. Quantitative measures to describe and analyze the concept drift have been proposed in previous work. A description composed from these measures is called a concept drift map. We believe that these maps could be useful for guiding how much knowledge in the old model should be forgotten. Therefore, this paper presents an adaptive online learning model that uses a concept drift map to regulate the forgetting factor of an extreme learning machine. Specifically, when a batch of new instances are labeled, the distribution of each class on each attribute is firstly estimated, and then it is compared with the distribution estimated in the previous batch to calculate the magnitude of concept drift, which is further used to regulate the forgetting factor and to update the learning model. Therefore, the novelty of this paper lies in that a quantitative distance metric between two distributions constructed on continuous attribute space is presented to construct concept drift map which can be further associated with the forgetting factor to make the learning model adapt the concept drift. Experimental results on several benchmark stream data sets show the proposed model is generally superior to several previous algorithms when classifying a variety of data streams subject to drift, indicating its effectiveness and feasibility.},
doi = {10.1016/j.neucom.2018.11.098},
keywords = {Concept Drift},
publisher = {Elsevier},
related = {learning-from-non-stationary-distributions},
}
ABSTRACT In online-learning, the data is incrementally received and the distributions from which it is drawn may keep changing over time. This phenomenon is widely known as concept drift. Such changes may affect the generalization of a learned model to future data. This problem may be exacerbated by the form of the drift itself changing over time. Quantitative measures to describe and analyze the concept drift have been proposed in previous work. A description composed from these measures is called a concept drift map. We believe that these maps could be useful for guiding how much knowledge in the old model should be forgotten. Therefore, this paper presents an adaptive online learning model that uses a concept drift map to regulate the forgetting factor of an extreme learning machine. Specifically, when a batch of new instances are labeled, the distribution of each class on each attribute is firstly estimated, and then it is compared with the distribution estimated in the previous batch to calculate the magnitude of concept drift, which is further used to regulate the forgetting factor and to update the learning model. Therefore, the novelty of this paper lies in that a quantitative distance metric between two distributions constructed on continuous attribute space is presented to construct concept drift map which can be further associated with the forgetting factor to make the learning model adapt the concept drift. Experimental results on several benchmark stream data sets show the proposed model is generally superior to several previous algorithms when classifying a variety of data streams subject to drift, indicating its effectiveness and feasibility.

Lucas, B., Pelletier, C., Inglada, J., Schmidt, D., Webb, G. I., & Petitjean, F.
Proceedings 10th International Workshop on the Analysis of Multitemporal Remote Sensing Images, MultiTemp 2019, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{LucasEtAl2019b,
author = {Lucas, B. and Pelletier, C. and Inglada, J. and Schmidt, D. and Webb, G. I. and Petitjean, F.},
booktitle = {Proceedings 10th International Workshop on the Analysis of Multitemporal Remote Sensing Images, MultiTemp 2019},
title = {Exploring Data Quantity Requirements for Domain Adaptation in the Classification of Satellite Image Time Series},
year = {2019},
publisher = {IEEE, Institute of Electrical and Electronics Engineers},
abstract = {Land cover maps are a vital input variable in all types of environmental research and management. However the modern state-of-the-art machine learning techniques used to create them require substantial training data to produce optimal accuracy. Domain Adaptation is one technique researchers might use when labelled training data are unavailable or scarce. This paper looks at the result of training a convolutional neural network model on a region where data are available (source domain), and then adapting this model to another region (target domain) by retraining it on the available labelled data, and in particular how these results change with increasing data availability. Our experiments performing domain adaptation on satellite image time series, draw three interesting conclusions: (1) a model trained only on data from the source domain delivers 73.0% test accuracy on the target domain; (2) when all of the weights are retrained on the target data, over 16,000 instances were required to improve upon the accuracy of the source-only model; and (3) even if sufficient data is available in the target domain, using a model pretrained on a source domain will result in better overall test accuracy compared to a model trained on target domain data only-88.9% versus 84.7%.},
doi = {10.1109/Multi-Temp.2019.8866898},
keywords = {time series, earth observation analytics},
related = {earth-observation-analytics},
}
ABSTRACT Land cover maps are a vital input variable in all types of environmental research and management. However the modern state-of-the-art machine learning techniques used to create them require substantial training data to produce optimal accuracy. Domain Adaptation is one technique researchers might use when labelled training data are unavailable or scarce. This paper looks at the result of training a convolutional neural network model on a region where data are available (source domain), and then adapting this model to another region (target domain) by retraining it on the available labelled data, and in particular how these results change with increasing data availability. Our experiments performing domain adaptation on satellite image time series, draw three interesting conclusions: (1) a model trained only on data from the source domain delivers 73.0% test accuracy on the target domain; (2) when all of the weights are retrained on the target data, over 16,000 instances were required to improve upon the accuracy of the source-only model; and (3) even if sufficient data is available in the target domain, using a model pretrained on a source domain will result in better overall test accuracy compared to a model trained on target domain data only-88.9% versus 84.7%.

Song, J., Wang, Y., Li, F., Akutsu, T., Rawlings, N. D., Webb, G. I., & Chou, K.
Briefings in Bioinformatics, 20(2), 638-658, 2019.
Clarivate Web of Science Hot Paper 2019 and Highly Cited Paper 2019, 2020, 2021, 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{doi:10.1093/bib/bby028,
author = {Song, Jiangning and Wang, Yanan and Li, Fuyi and Akutsu, Tatsuya and Rawlings, Neil D. and Webb, Geoffrey I. and Chou, Kuo-Chen},
journal = {Briefings in Bioinformatics},
title = {{iProt-Sub}: a comprehensive package for accurately mapping and predicting protease-specific substrates and cleavage sites},
year = {2019},
number = {2},
pages = {638--658},
volume = {20},
abstract = {Regulation of proteolysis plays a critical role in a myriad of important cellular processes. The key to better understanding the mechanisms that control this process is to identify the specific substrates that each protease targets. To address this, we have developed iProt-Sub, a powerful bioinformatics tool for the accurate prediction of protease-specific substrates and their cleavage sites. Importantly, iProt-Sub represents a significantly advanced version of its successful predecessor, PROSPER. It provides optimized cleavage site prediction models with better prediction performance and coverage for more species-specific proteases (4 major protease families and 38 different proteases). iProt-Sub integrates heterogeneous sequence and structural features and uses a two-step feature selection procedure to further remove redundant and irrelevant features in an effort to improve the cleavage site prediction accuracy. Features used by iProt-Sub are encoded by 11 different sequence encoding schemes, including local amino acid sequence profile, secondary structure, solvent accessibility and native disorder, which will allow a more accurate representation of the protease specificity of approximately 38 proteases and training of the prediction models. Benchmarking experiments using cross-validation and independent tests showed that iProt-Sub is able to achieve a better performance than several existing generic tools. We anticipate that iProt-Sub will be a powerful tool for proteome-wide prediction of protease-specific substrates and their cleavage sites, and will facilitate hypothesis-driven functional interrogation of protease-specific substrate cleavage and proteolytic events.},
comment = {Clarivate Web of Science Hot Paper 2019 and Highly Cited Paper 2019, 2020, 2021, 2022},
doi = {10.1093/bib/bby028},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Regulation of proteolysis plays a critical role in a myriad of important cellular processes. The key to better understanding the mechanisms that control this process is to identify the specific substrates that each protease targets. To address this, we have developed iProt-Sub, a powerful bioinformatics tool for the accurate prediction of protease-specific substrates and their cleavage sites. Importantly, iProt-Sub represents a significantly advanced version of its successful predecessor, PROSPER. It provides optimized cleavage site prediction models with better prediction performance and coverage for more species-specific proteases (4 major protease families and 38 different proteases). iProt-Sub integrates heterogeneous sequence and structural features and uses a two-step feature selection procedure to further remove redundant and irrelevant features in an effort to improve the cleavage site prediction accuracy. Features used by iProt-Sub are encoded by 11 different sequence encoding schemes, including local amino acid sequence profile, secondary structure, solvent accessibility and native disorder, which will allow a more accurate representation of the protease specificity of approximately 38 proteases and training of the prediction models. Benchmarking experiments using cross-validation and independent tests showed that iProt-Sub is able to achieve a better performance than several existing generic tools. We anticipate that iProt-Sub will be a powerful tool for proteome-wide prediction of protease-specific substrates and their cleavage sites, and will facilitate hypothesis-driven functional interrogation of protease-specific substrate cleavage and proteolytic events.

Li, F., Wang, Y., Li, C., Marquez-Lago, T. T., Leier, A., Rawlings, N. D., Haffari, G., Revote, J., Akutsu, T., Chou, K., Purcell, A. W., Pike, R. N., Webb, G. I., Smith, I. A., Lithgow, T., Daly, R. J., Whisstock, J. C., & Song, J.
Briefings in Bioinformatics, 20(6), 2150-2166, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Li18b,
author = {Li, Fuyi and Wang, Yanan and Li, Chen and Marquez-Lago, Tatiana T. and Leier, Andre and Rawlings, Neil D. and Haffari, Gholamreza and Revote, Jerico and Akutsu, Tatsuya and Chou, Kuo-Chen and Purcell, Anthony W. and Pike, Robert N. and Webb, Geoffrey I. and Smith, Ian A. and Lithgow, Trevor and Daly, Roger J. and Whisstock, James C. and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {Twenty years of bioinformatics research for protease-specific substrate and cleavage site prediction: a comprehensive revisit and benchmarking of existing methods},
year = {2019},
number = {6},
pages = {2150--2166},
volume = {20},
abstract = {The roles of proteolytic cleavage have been intensively investigated and discussed during the past two decades. This irreversible chemical process has been frequently reported to influence a number of crucial biological processes (BPs), such as cell cycle, protein regulation and inflammation. A number of advanced studies have been published aiming at deciphering the mechanisms of proteolytic cleavage. Given its significance and the large number of functionally enriched substrates targeted by specific proteases, many computational approaches have been established for accurate prediction of protease-specific substrates and their cleavage sites. Consequently, there is an urgent need to systematically assess the state-of-the-art computational approaches for protease-specific cleavage site prediction to further advance the existing methodologies and to improve the prediction performance. With this goal in mind, in this article, we carefully evaluated a total of 19 computational methods (including 8 scoring function-based methods and 11 machine learning-based methods) in terms of their underlying algorithm, calculated features, performance evaluation and software usability. Then, extensive independent tests were performed to assess the robustness and scalability of the reviewed methods using our carefully prepared independent test data sets with 3641 cleavage sites (specific to 10 proteases). The comparative experimental results demonstrate that PROSPERous is the most accurate generic method for predicting eight protease-specific cleavage sites, while GPS-CCD and LabCaS outperformed other predictors for calpain-specific cleavage sites. Based on our review, we then outlined some potential ways to improve the prediction performance and ease the computational burden by applying ensemble learning, deep learning, positive unlabeled learning and parallel and distributed computing techniques. 
We anticipate that our study will serve as a practical and useful guide for interested readers to further advance next-generation bioinformatics tools for protease-specific cleavage site prediction.},
doi = {10.1093/bib/bby077},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT The roles of proteolytic cleavage have been intensively investigated and discussed during the past two decades. This irreversible chemical process has been frequently reported to influence a number of crucial biological processes (BPs), such as cell cycle, protein regulation and inflammation. A number of advanced studies have been published aiming at deciphering the mechanisms of proteolytic cleavage. Given its significance and the large number of functionally enriched substrates targeted by specific proteases, many computational approaches have been established for accurate prediction of protease-specific substrates and their cleavage sites. Consequently, there is an urgent need to systematically assess the state-of-the-art computational approaches for protease-specific cleavage site prediction to further advance the existing methodologies and to improve the prediction performance. With this goal in mind, in this article, we carefully evaluated a total of 19 computational methods (including 8 scoring function-based methods and 11 machine learning-based methods) in terms of their underlying algorithm, calculated features, performance evaluation and software usability. Then, extensive independent tests were performed to assess the robustness and scalability of the reviewed methods using our carefully prepared independent test data sets with 3641 cleavage sites (specific to 10 proteases). The comparative experimental results demonstrate that PROSPERous is the most accurate generic method for predicting eight protease-specific cleavage sites, while GPS-CCD and LabCaS outperformed other predictors for calpain-specific cleavage sites. Based on our review, we then outlined some potential ways to improve the prediction performance and ease the computational burden by applying ensemble learning, deep learning, positive unlabeled learning and parallel and distributed computing techniques. 
We anticipate that our study will serve as a practical and useful guide for interested readers to further advance next-generation bioinformatics tools for protease-specific cleavage site prediction.

Tan, C. W., Petitjean, F., & Webb, G. I.
Proceedings of the 2019 SIAM International Conference on Data Mining, pp. 522-530, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{TanEtAl19,
author = {Tan, Chang Wei and Petitjean, Francois and Webb, Geoffrey I.},
booktitle = {Proceedings of the 2019 SIAM International Conference on Data Mining},
title = {Elastic bands across the path: A new framework and methods to lower bound {DTW}},
year = {2019},
pages = {522--530},
abstract = {There has been renewed recent interest in developing effective lower bounds for Dynamic Time Warping (DTW) distance between time series. These have many applications in time series indexing, clustering, forecasting, regression and classification. One of the key time series classification algorithms, the nearest neighbor algorithm with DTW distance (NN-DTW) is very expensive to compute, due to the quadratic complexity of DTW. Lower bound search can speed up NN-DTW substantially. An effective and tight lower bound quickly prunes off unpromising nearest neighbor candidates from the search space and minimises the number of the costly DTW computations. The speed up provided by lower bound search becomes increasingly critical as training set size increases. Different lower bounds provide different trade-offs between computation time and tightness. Most existing lower bounds interact with DTW warping window sizes. They are very tight and effective at smaller warping window sizes, but become looser as the warping window increases, thus reducing the pruning effectiveness for NN-DTW. In this work, we present a new class of lower bounds that are tighter than the popular Keogh lower bound, while requiring similar computation time. Our new lower bounds take advantage of the DTW boundary condition, monotonicity and continuity constraints to create a tighter lower bound. Of particular significance, they remain relatively tight even for large windows. A single parameter to these new lower bounds controls the speed-tightness trade-off. We demonstrate that these new lower bounds provide an exceptional balance between computation time and tightness for the NN-DTW time series classification task, resulting in greatly improved efficiency for NN-DTW lower bound search.},
keywords = {time series},
related = {scalable-time-series-classifiers},
url = {https://arxiv.org/abs/1808.09617},
}
ABSTRACT There has been renewed recent interest in developing effective lower bounds for Dynamic Time Warping (DTW) distance between time series. These have many applications in time series indexing, clustering, forecasting, regression and classification. One of the key time series classification algorithms, the nearest neighbor algorithm with DTW distance (NN-DTW) is very expensive to compute, due to the quadratic complexity of DTW. Lower bound search can speed up NN-DTW substantially. An effective and tight lower bound quickly prunes off unpromising nearest neighbor candidates from the search space and minimises the number of the costly DTW computations. The speed up provided by lower bound search becomes increasingly critical as training set size increases. Different lower bounds provide different trade-offs between computation time and tightness. Most existing lower bounds interact with DTW warping window sizes. They are very tight and effective at smaller warping window sizes, but become looser as the warping window increases, thus reducing the pruning effectiveness for NN-DTW. In this work, we present a new class of lower bounds that are tighter than the popular Keogh lower bound, while requiring similar computation time. Our new lower bounds take advantage of the DTW boundary condition, monotonicity and continuity constraints to create a tighter lower bound. Of particular significance, they remain relatively tight even for large windows. A single parameter to these new lower bounds controls the speed-tightness trade-off. We demonstrate that these new lower bounds provide an exceptional balance between computation time and tightness for the NN-DTW time series classification task, resulting in greatly improved efficiency for NN-DTW lower bound search.

Pelletier, C., Webb, G. I., & Petitjean, F.
Remote Sensing, 11(5), Art. no. 523, 2019.
Clarivate Web of Science Highly Cited Paper 2021, 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{PelletierEtAl19,
  author        = {Pelletier, Charlotte and Webb, Geoffrey I. and Petitjean, Francois},
  title         = {Temporal Convolutional Neural Network for the Classification of Satellite Image Time Series},
  journal       = {Remote Sensing},
  year          = {2019},
  volume        = {11},
  number        = {5},
  articlenumber = {523},
  issn          = {2072-4292},
  doi           = {10.3390/rs11050523},
  abstract      = {Latest remote sensing sensors are capable of acquiring high spatial and spectral Satellite Image Time Series (SITS) of the world. These image series are a key component of classification systems that aim at obtaining up-to-date and accurate land cover maps of the Earth’s surfaces. More specifically, current SITS combine high temporal, spectral and spatial resolutions, which makes it possible to closely monitor vegetation dynamics. Although traditional classification algorithms, such as Random Forest (RF), have been successfully applied to create land cover maps from SITS, these algorithms do not make the most of the temporal domain. This paper proposes a comprehensive study of Temporal Convolutional Neural Networks (TempCNNs), a deep learning approach which applies convolutions in the temporal dimension in order to automatically learn temporal (and spectral) features. The goal of this paper is to quantitatively and qualitatively evaluate the contribution of TempCNNs for SITS classification, as compared to RF and Recurrent Neural Networks (RNNs) —a standard deep learning approach that is particularly suited to temporal data. We carry out experiments on Formosat-2 scene with 46 images and one million labelled time series. The experimental results show that TempCNNs are more accurate than the current state of the art for SITS classification. We provide some general guidelines on the network architecture, common regularization mechanisms, and hyper-parameter values such as batch size; we also draw out some differences with standard results in computer vision (e.g., about pooling layers). Finally, we assess the visual quality of the land cover maps produced by TempCNNs.},
  comment       = {Clarivate Web of Science Highly Cited Paper 2021, 2022},
  keywords      = {time series, earth observation analytics},
  related       = {earth-observation-analytics},
}
ABSTRACT Latest remote sensing sensors are capable of acquiring high spatial and spectral Satellite Image Time Series (SITS) of the world. These image series are a key component of classification systems that aim at obtaining up-to-date and accurate land cover maps of the Earth’s surfaces. More specifically, current SITS combine high temporal, spectral and spatial resolutions, which makes it possible to closely monitor vegetation dynamics. Although traditional classification algorithms, such as Random Forest (RF), have been successfully applied to create land cover maps from SITS, these algorithms do not make the most of the temporal domain. This paper proposes a comprehensive study of Temporal Convolutional Neural Networks (TempCNNs), a deep learning approach which applies convolutions in the temporal dimension in order to automatically learn temporal (and spectral) features. The goal of this paper is to quantitatively and qualitatively evaluate the contribution of TempCNNs for SITS classification, as compared to RF and Recurrent Neural Networks (RNNs) —a standard deep learning approach that is particularly suited to temporal data. We carry out experiments on Formosat-2 scene with 46 images and one million labelled time series. The experimental results show that TempCNNs are more accurate than the current state of the art for SITS classification. We provide some general guidelines on the network architecture, common regularization mechanisms, and hyper-parameter values such as batch size; we also draw out some differences with standard results in computer vision (e.g., about pooling layers). Finally, we assess the visual quality of the land cover maps produced by TempCNNs.

Pelletier, C., Ji, Z., Hagolle, O., Morse-McNabb, E., Sheffield, K., Webb, G. I., & Petitjean, F.
Proceedings 10th International Workshop on the Analysis of Multitemporal Remote Sensing Images, MultiTemp 2019, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{PelletierEtAl19c,
  author    = {Pelletier, C. and Ji, Z. and Hagolle, O. and Morse-McNabb, E. and Sheffield, K. and Webb, G. I. and Petitjean, F.},
  title     = {Using {Sentinel-2} Image Time Series to map the State of {Victoria}, {Australia}},
  booktitle = {Proceedings 10th International Workshop on the Analysis of Multitemporal Remote Sensing Images, MultiTemp 2019},
  year      = {2019},
  doi       = {10.1109/Multi-Temp.2019.8866921},
  abstract  = {Sentinel-2 satellites are now acquiring images of the entire Earth every five days from 10 to 60 m spatial resolution. The supervised classification of this new optical image time series allows the operational production of accurate land cover maps over large areas. In this paper, we investigate the use of one year of Sentinel-2 data to map the state of Victoria in Australia. In particular, we produce two land cover maps using the most established and advanced algorithms in time series classification: Random Forest (RF) and Temporal Convolutional Neural Network (TempCNN). To our knowledge, these are the first land cover maps at 10 m spatial resolution for an Australian state.},
  keywords  = {cartography;convolutional neural nets;geophysical image processing;image classification;image resolution;land cover;optical images;optical information processing;remote sensing;terrain mapping;time series;TempCNN;temporal convolutional neural network;random forest;land cover maps;Victoria state;Australian state;spatial resolution;time series classification;Sentinel-2 data;accurate land cover maps;operational production;optical image time series;supervised classification;Sentinel-2 satellites;Australia;sentinel-2 image time series;Radio frequency;Spatial resolution;Time series analysis;Agriculture;Convolutional neural networks;Sentinel-2 images;land cover map;Temporal Convolutional Neural Networks;Random Forests;earth observation analytics},
  related   = {earth-observation-analytics},
}
ABSTRACT Sentinel-2 satellites are now acquiring images of the entire Earth every five days from 10 to 60 m spatial resolution. The supervised classification of this new optical image time series allows the operational production of accurate land cover maps over large areas. In this paper, we investigate the use of one year of Sentinel-2 data to map the state of Victoria in Australia. In particular, we produce two land cover maps using the most established and advanced algorithms in time series classification: Random Forest (RF) and Temporal Convolutional Neural Network (TempCNN). To our knowledge, these are the first land cover maps at 10 m spatial resolution for an Australian state.

Chen, Z., Li, L., Xu, D., Chou, K., Liu, X., Smith, A. I., Li, F., Song, J., Li, C., Leier, A., Marquez-Lago, T., Akutsu, T., & Webb, G. I.
Briefings in Bioinformatics, 20(6), 2267–2290, 2019.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ChenEtAl118b,
  author   = {Chen, Zhen and Li, Lei and Xu, Dakang and Chou, Kuo-Chen and Liu, Xuhan and Smith, Alexander Ian and Li, Fuyi and Song, Jiangning and Li, Chen and Leier, Andre and Marquez-Lago, Tatiana and Akutsu, Tatsuya and Webb, Geoffrey I.},
  title    = {Large-scale comparative assessment of computational predictors for lysine post-translational modification sites},
  journal  = {Briefings in Bioinformatics},
  year     = {2019},
  volume   = {20},
  number   = {6},
  pages    = {2267--2290},
  doi      = {10.1093/bib/bby089},
  abstract = {Lysine post-translational modifications (PTMs) play a crucial role in regulating diverse functions and biological processes of proteins. However, because of the large volumes of sequencing data generated from genome-sequencing projects, systematic identification of different types of lysine PTM substrates and PTM sites in the entire proteome remains a major challenge. In recent years, a number of computational methods for lysine PTM identification have been developed. These methods show high diversity in their core algorithms, features extracted and feature selection techniques and evaluation strategies. There is therefore an urgent need to revisit these methods and summarize their methodologies, to improve and further develop computational techniques to identify and characterize lysine PTMs from the large amounts of sequence data. With this goal in mind, we first provide a comprehensive survey on a large collection of 49 state-of-the-art approaches for lysine PTM prediction. We cover a variety of important aspects that are crucial for the development of successful predictors, including operating algorithms, sequence and structural features, feature selection, model performance evaluation and software utility. We further provide our thoughts on potential strategies to improve the model performance. Second, in order to examine the feasibility of using deep learning for lysine PTM prediction, we propose a novel computational framework, termed MUscADEL (Multiple Scalable Accurate Deep Learner for lysine PTMs), using deep, bidirectional, long short-term memory recurrent neural networks for accurate and systematic mapping of eight major types of lysine PTMs in the human and mouse proteomes. Extensive benchmarking tests show that MUscADEL outperforms current methods for lysine PTM characterization, demonstrating the potential and power of deep learning techniques in protein PTM prediction. The web server of MUscADEL, together with all the data sets assembled in this study, is freely available at http://muscadel.erc.monash.edu/. We anticipate this comprehensive review and the application of deep learning will provide practical guide and useful insights into PTM prediction and inspire future bioinformatics studies in the related fields.},
  keywords = {Bioinformatics},
  related  = {computational-biology},
}
ABSTRACT Lysine post-translational modifications (PTMs) play a crucial role in regulating diverse functions and biological processes of proteins. However, because of the large volumes of sequencing data generated from genome-sequencing projects, systematic identification of different types of lysine PTM substrates and PTM sites in the entire proteome remains a major challenge. In recent years, a number of computational methods for lysine PTM identification have been developed. These methods show high diversity in their core algorithms, features extracted and feature selection techniques and evaluation strategies. There is therefore an urgent need to revisit these methods and summarize their methodologies, to improve and further develop computational techniques to identify and characterize lysine PTMs from the large amounts of sequence data. With this goal in mind, we first provide a comprehensive survey on a large collection of 49 state-of-the-art approaches for lysine PTM prediction. We cover a variety of important aspects that are crucial for the development of successful predictors, including operating algorithms, sequence and structural features, feature selection, model performance evaluation and software utility. We further provide our thoughts on potential strategies to improve the model performance. Second, in order to examine the feasibility of using deep learning for lysine PTM prediction, we propose a novel computational framework, termed MUscADEL (Multiple Scalable Accurate Deep Learner for lysine PTMs), using deep, bidirectional, long short-term memory recurrent neural networks for accurate and systematic mapping of eight major types of lysine PTMs in the human and mouse proteomes. Extensive benchmarking tests show that MUscADEL outperforms current methods for lysine PTM characterization, demonstrating the potential and power of deep learning techniques in protein PTM prediction. 
The web server of MUscADEL, together with all the data sets assembled in this study, is freely available at http://muscadel.erc.monash.edu/. We anticipate this comprehensive review and the application of deep learning will provide practical guide and useful insights into PTM prediction and inspire future bioinformatics studies in the related fields.

Wang, H., Feng, L., Webb, G. I., Kurgan, L., Song, J., & Lin, D.
Briefings in Bioinformatics, 19(5), 838–852, 2018.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WangEtAl18,
  author   = {Wang, Huilin and Feng, Liubin and Webb, Geoffrey I. and Kurgan, Lukasz and Song, Jiangning and Lin, Donghai},
  title    = {Critical evaluation of bioinformatics tools for the prediction of protein crystallization propensity},
  journal  = {Briefings in Bioinformatics},
  year     = {2018},
  volume   = {19},
  number   = {5},
  pages    = {838--852},
  doi      = {10.1093/bib/bbx018},
  abstract = {X-ray crystallography is the main tool for structural determination of proteins. Yet, the underlying crystallization process is costly, has a high attrition rate and involves a series of trial-and-error attempts to obtain diffraction-quality crystals. The Structural Genomics Consortium aims to systematically solve representative structures of major protein-fold classes using primarily high-throughput X-ray crystallography. The attrition rate of these efforts can be improved by selection of proteins that are potentially easier to be crystallized. In this context, bioinformatics approaches have been developed to predict crystallization propensities based on protein sequences. These approaches are used to facilitate prioritization of the most promising target proteins, search for alternative structural orthologues of the target proteins and suggest designs of constructs capable of potentially enhancing the likelihood of successful crystallization. We reviewed and compared nine predictors of protein crystallization propensity. Moreover, we demonstrated that integrating selected outputs from multiple predictors as candidate input features to build the predictive model results in a significantly higher predictive performance when compared to using these predictors individually. Furthermore, we also introduced a new and accurate predictor of protein crystallization propensity, Crysf, which uses functional features extracted from UniProt as inputs. This comprehensive review will assist structural biologists in selecting the most appropriate predictor, and is also beneficial for bioinformaticians to develop a new generation of predictive algorithms.},
  keywords = {Bioinformatics},
  related  = {computational-biology},
}
ABSTRACT X-ray crystallography is the main tool for structural determination of proteins. Yet, the underlying crystallization process is costly, has a high attrition rate and involves a series of trial-and-error attempts to obtain diffraction-quality crystals. The Structural Genomics Consortium aims to systematically solve representative structures of major protein-fold classes using primarily high-throughput X-ray crystallography. The attrition rate of these efforts can be improved by selection of proteins that are potentially easier to be crystallized. In this context, bioinformatics approaches have been developed to predict crystallization propensities based on protein sequences. These approaches are used to facilitate prioritization of the most promising target proteins, search for alternative structural orthologues of the target proteins and suggest designs of constructs capable of potentially enhancing the likelihood of successful crystallization. We reviewed and compared nine predictors of protein crystallization propensity. Moreover, we demonstrated that integrating selected outputs from multiple predictors as candidate input features to build the predictive model results in a significantly higher predictive performance when compared to using these predictors individually. Furthermore, we also introduced a new and accurate predictor of protein crystallization propensity, Crysf, which uses functional features extracted from UniProt as inputs. This comprehensive review will assist structural biologists in selecting the most appropriate predictor, and is also beneficial for bioinformaticians to develop a new generation of predictive algorithms.

Chen, Z., Zhao, P., Li, F., Leier, A., Marquez-Lago, T. T., Wang, Y., Webb, G. I., Smith, I. A., Daly, R. J., Chou, K., & Song, J.
Bioinformatics, 34(14), 2499–2502, 2018.
Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021, 2022
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ChenEtAl18,
  author   = {Chen, Zhen and Zhao, Pei and Li, Fuyi and Leier, Andre and Marquez-Lago, Tatiana T. and Wang, Yanan and Webb, Geoffrey I. and Smith, A. Ian and Daly, Roger J. and Chou, Kuo-Chen and Song, Jiangning},
  title    = {{iFeature}: a {Python} package and web server for features extraction and selection from protein and peptide sequences},
  journal  = {Bioinformatics},
  year     = {2018},
  volume   = {34},
  number   = {14},
  pages    = {2499--2502},
  doi      = {10.1093/bioinformatics/bty140},
  abstract = {Structural and physiochemical descriptors extracted from sequence data have been widely used to represent sequences and predict structural, functional, expression and interaction profiles of proteins and peptides as well as DNAs/RNAs. Here, we present iFeature, a versatile Python-based toolkit for generating various numerical feature representation schemes for both protein and peptide sequences. iFeature is capable of calculating and extracting a comprehensive spectrum of 18 major sequence encoding schemes that encompass 53 different types of feature descriptors. It also allows users to extract specific amino acid properties from the AAindex database. Furthermore, iFeature integrates 12 different types of commonly used feature clustering, selection and dimensionality reduction algorithms, greatly facilitating training, analysis and benchmarking of machine-learning models. The functionality of iFeature is made freely available via an online web server and a stand-alone toolkit.},
  comment  = {Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021, 2022},
  keywords = {Bioinformatics},
  related  = {computational-biology},
}
ABSTRACT Structural and physiochemical descriptors extracted from sequence data have been widely used to represent sequences and predict structural, functional, expression and interaction profiles of proteins and peptides as well as DNAs/RNAs. Here, we present iFeature, a versatile Python-based toolkit for generating various numerical feature representation schemes for both protein and peptide sequences. iFeature is capable of calculating and extracting a comprehensive spectrum of 18 major sequence encoding schemes that encompass 53 different types of feature descriptors. It also allows users to extract specific amino acid properties from the AAindex database. Furthermore, iFeature integrates 12 different types of commonly used feature clustering, selection and dimensionality reduction algorithms, greatly facilitating training, analysis and benchmarking of machine-learning models. The functionality of iFeature is made freely available via an online web server and a stand-alone toolkit.

Li, C., Clark, L. V. T., Zhang, R., Porebski, B. T., McCoey, J. M., Borg, N. A., Webb, G. I., Kass, I., Buckle, M., Song, J., Woolfson, A., & Buckle, A. M.
Journal of Molecular Biology, 430(18), 3200–3217, 2018.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{Li2018,
  author   = {Li, Chen and Clark, Liah V. T. and Zhang, Rory and Porebski, Benjamin T. and McCoey, Julia M. and Borg, Natalie A. and Webb, Geoffrey I. and Kass, Itamar and Buckle, Malcolm and Song, Jiangning and Woolfson, Adrian and Buckle, Ashley M.},
  title    = {Structural Capacitance in Protein Evolution and Human Diseases},
  journal  = {Journal of Molecular Biology},
  year     = {2018},
  volume   = {430},
  number   = {18},
  pages    = {3200--3217},
  issn     = {0022-2836},
  doi      = {10.1016/j.jmb.2018.06.051},
  keywords = {Bioinformatics},
  related  = {computational-biology},
}
ABSTRACT 

Advances in Knowledge Discovery and Data Mining: 22nd Pacific-Asia Conference, PAKDD 2018, Melbourne, VIC, Australia, June 3-6, 2018, Proceedings (Vol. 10939)
Phung, D., Tseng, V. S., Webb, G. I., Ho, B., Ganji, M., & Rashidi, L. (Eds.).
Springer, 2018.
[Bibtex]  → Access on publisher site

@Proceedings{phung2018advances,
  editor    = {Phung, Dinh and Tseng, Vincent S. and Webb, Geoffrey I. and Ho, Bao and Ganji, Mohadeseh and Rashidi, Lida},
  title     = {Advances in Knowledge Discovery and Data Mining: 22nd Pacific-Asia Conference, PAKDD 2018, Melbourne, VIC, Australia, June 3-6, 2018, Proceedings},
  year      = {2018},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  volume    = {10939},
}
ABSTRACT 

Petitjean, F., Buntine, W., Webb, G. I., & Zaidi, N.
Machine Learning, 107(8–10), 1303–1331, 2018.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Petitjean2018,
  author   = {Petitjean, Francois and Buntine, Wray and Webb, Geoffrey I. and Zaidi, Nayyar},
  title    = {Accurate parameter estimation for {Bayesian} network classifiers using hierarchical {Dirichlet} processes},
  journal  = {Machine Learning},
  year     = {2018},
  volume   = {107},
  number   = {8-10},
  pages    = {1303--1331},
  issn     = {1573-0565},
  doi      = {10.1007/s10994-018-5718-0},
  url      = {https://rdcu.be/OX3g},
  abstract = {This paper introduces a novel parameter estimation method for the probability tables of Bayesian network classifiers (BNCs), using hierarchical Dirichlet processes (HDPs). The main result of this paper is to show that improved parameter estimation allows BNCs to outperform leading learning methods such as random forest for both 0--1 loss and RMSE, albeit just on categorical datasets. As data assets become larger, entering the hyped world of ``big'', efficient accurate classification requires three main elements: (1) classifiers with low-bias that can capture the fine-detail of large datasets (2) out-of-core learners that can learn from data without having to hold it all in main memory and (3) models that can classify new data very efficiently. The latest BNCs satisfy these requirements. Their bias can be controlled easily by increasing the number of parents of the nodes in the graph. Their structure can be learned out of core with a limited number of passes over the data. However, as the bias is made lower to accurately model classification tasks, so is the accuracy of their parameters' estimates, as each parameter is estimated from ever decreasing quantities of data. In this paper, we introduce the use of HDPs for accurate BNC parameter estimation even with lower bias. We conduct an extensive set of experiments on 68 standard datasets and demonstrate that our resulting classifiers perform very competitively with random forest in terms of prediction, while keeping the out-of-core capability and superior classification time.},
  keywords = {Conditional Probability Estimation and Bayesian Learning},
  related  = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT This paper introduces a novel parameter estimation method for the probability tables of Bayesian network classifiers (BNCs), using hierarchical Dirichlet processes (HDPs). The main result of this paper is to show that improved parameter estimation allows BNCs to outperform leading learning methods such as random forest for both 0–1 loss and RMSE, albeit just on categorical datasets. As data assets become larger, entering the hyped world of “big”, efficient accurate classification requires three main elements: (1) classifiers with low-bias that can capture the fine-detail of large datasets (2) out-of-core learners that can learn from data without having to hold it all in main memory and (3) models that can classify new data very efficiently. The latest BNCs satisfy these requirements. Their bias can be controlled easily by increasing the number of parents of the nodes in the graph. Their structure can be learned out of core with a limited number of passes over the data. However, as the bias is made lower to accurately model classification tasks, so is the accuracy of their parameters' estimates, as each parameter is estimated from ever decreasing quantities of data. In this paper, we introduce the use of HDPs for accurate BNC parameter estimation even with lower bias. We conduct an extensive set of experiments on 68 standard datasets and demonstrate that our resulting classifiers perform very competitively with random forest in terms of prediction, while keeping the out-of-core capability and superior classification time.

Manapragada, C., Webb, G. I., & Salehi, M.
Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, New York, NY, USA, pp. 1953–1962, 2018.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ManapragadaEtAl18,
  author    = {Manapragada, Chaitanya and Webb, Geoffrey I. and Salehi, Mahsa},
  title     = {Extremely Fast Decision Tree},
  booktitle = {Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year      = {2018},
  series    = {KDD '18},
  pages     = {1953--1962},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {London, United Kingdom},
  isbn      = {978-1-4503-5552-0},
  doi       = {10.1145/3219819.3220005},
  acmid     = {3220005},
  abstract  = {We introduce a novel incremental decision tree learning algorithm, Hoeffding Anytime Tree, that is statistically more efficient than the current state-of-the-art, Hoeffding Tree. We demonstrate that an implementation of Hoeffding Anytime Tree---"Extremely Fast Decision Tree", a minor modification to the MOA implementation of Hoeffding Tree---obtains significantly superior prequential accuracy on most of the largest classification datasets from the UCI repository. Hoeffding Anytime Tree produces the asymptotic batch tree in the limit, is naturally resilient to concept drift, and can be used as a higher accuracy replacement for Hoeffding Tree in most scenarios, at a small additional computational cost.},
  keywords  = {Concept Drift},
  related   = {learning-from-non-stationary-distributions},
}
ABSTRACT We introduce a novel incremental decision tree learning algorithm, Hoeffding Anytime Tree, that is statistically more efficient than the current state-of-the-art, Hoeffding Tree. We demonstrate that an implementation of Hoeffding Anytime Tree—"Extremely Fast Decision Tree", a minor modification to the MOA implementation of Hoeffding Tree—obtains significantly superior prequential accuracy on most of the largest classification datasets from the UCI repository. Hoeffding Anytime Tree produces the asymptotic batch tree in the limit, is naturally resilient to concept drift, and can be used as a higher accuracy replacement for Hoeffding Tree in most scenarios, at a small additional computational cost.

Song, J., Li, F., Takemoto, K., Haffari, G., Akutsu, T., Chou, K. C., & Webb, G. I.
Journal of Theoretical Biology, 443, 125–137, 2018.
Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{SongEtAl18,
  author   = {Song, J. and Li, F. and Takemoto, K. and Haffari, G. and Akutsu, T. and Chou, K. C. and Webb, G. I.},
  title    = {{PREvaIL}, an integrative approach for inferring catalytic residues using sequence, structural, and network features in a machine-learning framework},
  journal  = {Journal of Theoretical Biology},
  year     = {2018},
  volume   = {443},
  pages    = {125--137},
  doi      = {10.1016/j.jtbi.2018.01.023},
  url      = {https://authors.elsevier.com/c/1WWQY57ilzyRc},
  comment  = {Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020},
  keywords = {Bioinformatics},
  related  = {computational-biology},
}
ABSTRACT 

Zaidi, N. A., Petitjean, F., & Webb, G. I.
Proceedings of the 2018 SIAM International Conference on Data Mining, pp. 459–467, 2018.
[Bibtex]  → Access on publisher site  → Related papers and software

@InProceedings{Zaidi2018,
  author    = {Zaidi, Nayyar A. and Petitjean, Francois and Webb, Geoffrey I.},
  title     = {Efficient and Effective Accelerated Hierarchical Higher-Order Logistic Regression for Large Data Quantities},
  booktitle = {Proceedings of the 2018 SIAM International Conference on Data Mining},
  year      = {2018},
  pages     = {459--467},
  doi       = {10.1137/1.9781611975321.52},
  keywords  = {Conditional Probability Estimation and WANBIA},
  related   = {combining-generative-and-discriminative-learning},
}
ABSTRACT 

Nguyen, K., Le, T., Nguyen, T. D., Phung, D., & Webb, G. I.
Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, New York, NY, USA, pp. 2003–2011, 2018.
[Bibtex] [Abstract]  → Access on publisher site

@InProceedings{nguyenetal18,
  author    = {Nguyen, Khanh and Le, Trung and Nguyen, Tu Dinh and Phung, Dinh and Webb, Geoffrey I.},
  title     = {Robust {Bayesian} Kernel Machine via {Stein} Variational Gradient Descent for Big Data},
  booktitle = {Proceedings of the 24th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  year      = {2018},
  series    = {KDD '18},
  pages     = {2003--2011},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {London, United Kingdom},
  isbn      = {978-1-4503-5552-0},
  doi       = {10.1145/3219819.3220015},
  url       = {http://doi.acm.org/10.1145/3219819.3220015},
  acmid     = {3220015},
  numpages  = {9},
  abstract  = {Kernel methods are powerful supervised machine learning models for their strong generalization ability, especially on limited data to effectively generalize on unseen data. However, most kernel methods, including the state-of-the-art LIBSVM, are vulnerable to the curse of kernelization, making them infeasible to apply to large-scale datasets. This issue is exacerbated when kernel methods are used in conjunction with a grid search to tune their kernel parameters and hyperparameters which brings in the question of model robustness when applied to real datasets. In this paper, we propose a robust Bayesian Kernel Machine (BKM) - a Bayesian kernel machine that exploits the strengths of both the Bayesian modelling and kernel methods. A key challenge for such a formulation is the need for an efficient learning algorithm. To this end, we successfully extended the recent Stein variational theory for Bayesian inference for our proposed model, resulting in fast and efficient learning and prediction algorithms. Importantly our proposed BKM is resilient to the curse of kernelization, hence making it applicable to large-scale datasets and robust to parameter tuning, avoiding the associated expense and potential pitfalls with current practice of parameter tuning. Our extensive experimental results on 12 benchmark datasets show that our BKM without tuning any parameter can achieve comparable predictive performance with the state-of-the-art LIBSVM and significantly outperforms other baselines, while obtaining significantly speedup in terms of the total training time compared with its rivals.},
  keywords  = {Bayesian inference, big data, kernel methods, multiclass supervised learning, random feature, stein divergence, variational method},
}
ABSTRACT Kernel methods are powerful supervised machine learning models for their strong generalization ability, especially on limited data to effectively generalize on unseen data. However, most kernel methods, including the state-of-the-art LIBSVM, are vulnerable to the curse of kernelization, making them infeasible to apply to large-scale datasets. This issue is exacerbated when kernel methods are used in conjunction with a grid search to tune their kernel parameters and hyperparameters which brings in the question of model robustness when applied to real datasets. In this paper, we propose a robust Bayesian Kernel Machine (BKM) - a Bayesian kernel machine that exploits the strengths of both the Bayesian modelling and kernel methods. A key challenge for such a formulation is the need for an efficient learning algorithm. To this end, we successfully extended the recent Stein variational theory for Bayesian inference for our proposed model, resulting in fast and efficient learning and prediction algorithms. Importantly our proposed BKM is resilient to the curse of kernelization, hence making it applicable to large-scale datasets and robust to parameter tuning, avoiding the associated expense and potential pitfalls with current practice of parameter tuning. Our extensive experimental results on 12 benchmark datasets show that our BKM without tuning any parameter can achieve comparable predictive performance with the state-of-the-art LIBSVM and significantly outperforms other baselines, while obtaining significantly speedup in terms of the total training time compared with its rivals.

Shi, W., Zhang, A., & Webb, G. I.
International Journal of Geographical Information Science, 32(6), 1247–1270, 2018.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{doi:10.1080/13658816.2018.1434525,
Title = {Mining significant crisp-fuzzy spatial association rules},
Author = {Shi, Wenzhong and Zhang, Anshu and Webb, Geoffrey I.},
Journal = {International Journal of Geographical Information Science},
Year = {2018},
Number = {6},
Pages = {1247--1270},
Volume = {32},
Abstract = {Spatial association rule mining (SARM) is an important data mining task for understanding implicit and sophisticated interactions in spatial data. The usefulness of SARM results, represented as sets of rules, depends on their reliability: the abundance of rules, control over the risk of spurious rules, and accuracy of rule interestingness measure (RIM) values. This study presents crisp-fuzzy SARM, a novel SARM method that can enhance the reliability of resultant rules. The method firstly prunes dubious rules using statistically sound tests and crisp supports for the patterns involved, and then evaluates RIMs of accepted rules using fuzzy supports. For the RIM evaluation stage, the study also proposes a Gaussian-curve-based fuzzy data discretization model for SARM with improved design for spatial semantics. The proposed techniques were evaluated by both synthetic and real-world data. The synthetic data was generated with predesigned rules and RIM values, thus the reliability of SARM results could be confidently and quantitatively evaluated. The proposed techniques showed high efficacy in enhancing the reliability of SARM results in all three aspects. The abundance of resultant rules was improved by 50% or more compared with using conventional fuzzy SARM. Minimal risk of spurious rules was guaranteed by statistically sound tests. The probability that the entire result contained any spurious rules was below 1%. The RIM values also avoided large positive errors committed by crisp SARM, which typically exceeded 50% for representative RIMs. The real-world case study on New York City points of interest reconfirms the improved reliability of crisp-fuzzy SARM results, and demonstrates that such improvement is critical for practical spatial data analytics and decision support.},
Doi = {10.1080/13658816.2018.1434525},
Keywords = {Association Rule Discovery and statistically sound discovery},
Publisher = {Taylor \& Francis},
Related = {filtered-top-k-association-discovery},
Url = {http://www.tandfonline.com/eprint/aMdSMrAGuEHsHWSzIuqm/full}
}
ABSTRACT Spatial association rule mining (SARM) is an important data mining task for understanding implicit and sophisticated interactions in spatial data. The usefulness of SARM results, represented as sets of rules, depends on their reliability: the abundance of rules, control over the risk of spurious rules, and accuracy of rule interestingness measure (RIM) values. This study presents crisp-fuzzy SARM, a novel SARM method that can enhance the reliability of resultant rules. The method firstly prunes dubious rules using statistically sound tests and crisp supports for the patterns involved, and then evaluates RIMs of accepted rules using fuzzy supports. For the RIM evaluation stage, the study also proposes a Gaussian-curve-based fuzzy data discretization model for SARM with improved design for spatial semantics. The proposed techniques were evaluated by both synthetic and real-world data. The synthetic data was generated with predesigned rules and RIM values, thus the reliability of SARM results could be confidently and quantitatively evaluated. The proposed techniques showed high efficacy in enhancing the reliability of SARM results in all three aspects. The abundance of resultant rules was improved by 50% or more compared with using conventional fuzzy SARM. Minimal risk of spurious rules was guaranteed by statistically sound tests. The probability that the entire result contained any spurious rules was below 1%. The RIM values also avoided large positive errors committed by crisp SARM, which typically exceeded 50% for representative RIMs. The real-world case study on New York City points of interest reconfirms the improved reliability of crisp-fuzzy SARM results, and demonstrates that such improvement is critical for practical spatial data analytics and decision support.

Webb, G. I., Lee, L. K., Goethals, B., & Petitjean, F.
Data Mining and Knowledge Discovery, 32(5), 1179-1199, 2018.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{WebbEtAl18,
Title = {Analyzing concept drift and shift from sample data},
Author = {Webb, Geoffrey I and Lee, Loong Kuan and Goethals, Bart and Petitjean, Francois},
Journal = {Data Mining and Knowledge Discovery},
Year = {2018},
Number = {5},
Pages = {1179--1199},
Volume = {32},
Abstract = {Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We propose a new data mining task, concept drift mapping - the description and analysis of instances of concept drift or shift. We argue that concept drift mapping is an essential prerequisite for tackling concept drift and shift. We propose tools for this purpose, arguing for the importance of quantitative descriptions of drift and shift in marginal distributions. We present quantitative concept drift mapping techniques, along with methods for visualizing their results. We illustrate their effectiveness for real-world applications across energy-pricing, vegetation monitoring and airline scheduling.},
Doi = {10.1007/s10618-018-0554-1},
Keywords = {Concept Drift},
Related = {learning-from-non-stationary-distributions},
Url = {http://rdcu.be/IUTI}
}
ABSTRACT Concept drift and shift are major issues that greatly affect the accuracy and reliability of many real-world applications of machine learning. We propose a new data mining task, concept drift mapping - the description and analysis of instances of concept drift or shift. We argue that concept drift mapping is an essential prerequisite for tackling concept drift and shift. We propose tools for this purpose, arguing for the importance of quantitative descriptions of drift and shift in marginal distributions. We present quantitative concept drift mapping techniques, along with methods for visualizing their results. We illustrate their effectiveness for real-world applications across energy-pricing, vegetation monitoring and airline scheduling.

An, Y., Wang, J., Li, C., Leier, A., Marquez-Lago, T., Wilksch, J., Zhang, Y., Webb, G. I., Song, J., & Lithgow, T.
Briefings in Bioinformatics, 19(1), 148-161, 2018.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{AnEtAl2016,
author = {An, Yi and Wang, Jiawei and Li, Chen and Leier, Andre and Marquez-Lago, Tatiana and Wilksch, Jonathan and Zhang, Yang and Webb, Geoffrey I. and Song, Jiangning and Lithgow, Trevor},
journal = {Briefings in Bioinformatics},
title = {Comprehensive assessment and performance improvement of effector protein predictors for bacterial secretion systems {III}, {IV} and {VI}},
year = {2018},
number = {1},
pages = {148--161},
volume = {19},
abstract = {Bacterial effector proteins secreted by various protein secretion systems play crucial roles in host-pathogen interactions. In this context, computational tools capable of accurately predicting effector proteins of the various types of bacterial secretion systems are highly desirable. Existing computational approaches use different machine learning (ML) techniques and heterogeneous features derived from protein sequences and/or structural information. These predictors differ not only in terms of the used ML methods but also with respect to the used curated data sets, the features selection and their prediction performance. Here, we provide a comprehensive survey and benchmarking of currently available tools for the prediction of effector proteins of bacterial types III, IV and VI secretion systems (T3SS, T4SS and T6SS, respectively). We review core algorithms, feature selection techniques, tool availability and applicability and evaluate the prediction performance based on carefully curated independent test data sets. In an effort to improve predictive performance, we constructed three ensemble models based on ML algorithms by integrating the output of all individual predictors reviewed. Our benchmarks demonstrate that these ensemble models outperform all the reviewed tools for the prediction of effector proteins of T3SS and T4SS. The webserver of the proposed ensemble methods for T3SS and T4SS effector protein prediction is freely available at http://tbooster.erc.monash.edu/index.jsp. We anticipate that this survey will serve as a useful guide for interested users and that the new ensemble predictors will stimulate research into host-pathogen relationships and inspiration for the development of new bioinformatics tools for predicting effector proteins of T3SS, T4SS and T6SS.},
doi = {10.1093/bib/bbw100},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT Bacterial effector proteins secreted by various protein secretion systems play crucial roles in host-pathogen interactions. In this context, computational tools capable of accurately predicting effector proteins of the various types of bacterial secretion systems are highly desirable. Existing computational approaches use different machine learning (ML) techniques and heterogeneous features derived from protein sequences and/or structural information. These predictors differ not only in terms of the used ML methods but also with respect to the used curated data sets, the features selection and their prediction performance. Here, we provide a comprehensive survey and benchmarking of currently available tools for the prediction of effector proteins of bacterial types III, IV and VI secretion systems (T3SS, T4SS and T6SS, respectively). We review core algorithms, feature selection techniques, tool availability and applicability and evaluate the prediction performance based on carefully curated independent test data sets. In an effort to improve predictive performance, we constructed three ensemble models based on ML algorithms by integrating the output of all individual predictors reviewed. Our benchmarks demonstrate that these ensemble models outperform all the reviewed tools for the prediction of effector proteins of T3SS and T4SS. The webserver of the proposed ensemble methods for T3SS and T4SS effector protein prediction is freely available at http://tbooster.erc.monash.edu/index.jsp. We anticipate that this survey will serve as a useful guide for interested users and that the new ensemble predictors will stimulate research into host-pathogen relationships and inspiration for the development of new bioinformatics tools for predicting effector proteins of T3SS, T4SS and T6SS.

Zaidi, N. A., Webb, G. I., Petitjean, F., & Forestier, G.
arxiv, 1801.09354, 2018.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ZaidiEtAl18b,
Title = {On the Inter-Relationships among Drift Rate, Forgetting Rate, Bias/Variance Profile and Error},
Author = {Zaidi, Nayyar A. and Webb, Geoffrey I. and Petitjean, Francois and Forestier, Germain},
Journal = {arxiv},
Year = {2018},
Pages = {1801.09354},
Abstract = {We propose two general and falsifiable hypotheses about expectations on generalization error when learning in the context of concept drift. One posits that as drift rate increases, the forgetting rate that minimizes generalization error will also increase and vice versa. The other posits that as a learner's forgetting rate increases, the bias/variance profile that minimizes generalization error will have lower variance and vice versa. These hypotheses lead to the concept of the sweet path, a path through the 3-d space of alternative drift rates, forgetting rates and bias/variance profiles on which generalization error will be minimized, such that slow drift is coupled with low forgetting and low bias, while rapid drift is coupled with fast forgetting and low variance. We present experiments that support the existence of such a sweet path. We also demonstrate that simple learners that select appropriate forgetting rates and bias/variance profiles are highly competitive with the state-of-the-art in incremental learners for concept drift on real-world drift problems.},
Archiveprefix = {arXiv},
Eprint = {1801.09354},
Keywords = {Concept Drift},
Related = {learning-from-non-stationary-distributions},
Url = {https://arxiv.org/abs/1801.09354}
}
ABSTRACT We propose two general and falsifiable hypotheses about expectations on generalization error when learning in the context of concept drift. One posits that as drift rate increases, the forgetting rate that minimizes generalization error will also increase and vice versa. The other posits that as a learner's forgetting rate increases, the bias/variance profile that minimizes generalization error will have lower variance and vice versa. These hypotheses lead to the concept of the sweet path, a path through the 3-d space of alternative drift rates, forgetting rates and bias/variance profiles on which generalization error will be minimized, such that slow drift is coupled with low forgetting and low bias, while rapid drift is coupled with fast forgetting and low variance. We present experiments that support the existence of such a sweet path. We also demonstrate that simple learners that select appropriate forgetting rates and bias/variance profiles are highly competitive with the state-of-the-art in incremental learners for concept drift on real-world drift problems.

Tan, C. W., Herrmann, M., Forestier, G., Webb, G. I., & Petitjean, F.
Proceedings of the 2018 SIAM International Conference on Data Mining, pp. 459-467, 2018.
Best Research Paper Award

@InProceedings{TanEtAl18,
author = {Tan, Chang Wei and Herrmann, Matthieu and Forestier, Germain and Webb, Geoffrey I. and Petitjean, Francois},
booktitle = {Proceedings of the 2018 {SIAM} International Conference on Data Mining},
title = {Efficient search of the best warping window for {Dynamic Time Warping}},
year = {2018},
pages = {459--467},
abstract = {Time series classification maps time series to labels. The nearest neighbour algorithm (NN) using the Dynamic Time Warping (DTW) similarity measure is a leading algorithm for this task and a component of the current best ensemble classifiers for time series. However, NN-DTW is only a winning combination when its meta-parameter - its warping window - is learned from the training data. The warping window (WW) intuitively controls the amount of distortion allowed when comparing a pair of time series. With a training database of N time series of lengths L, a naive approach to learning the WW requires Omega(N^2 . L^3) operations. This often translates in NN-DTW requiring days for training on datasets containing a few thousand time series only. In this paper, we introduce FastWWSearch: an efficient and exact method to learn WW. We show on 86 datasets that our method is always faster than the state of the art, with at least one order of magnitude and up to 1000x speed-up.},
comment = {Best Research Paper Award},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT Time series classification maps time series to labels. The nearest neighbour algorithm (NN) using the Dynamic Time Warping (DTW) similarity measure is a leading algorithm for this task and a component of the current best ensemble classifiers for time series. However, NN-DTW is only a winning combination when its meta-parameter - its warping window - is learned from the training data. The warping window (WW) intuitively controls the amount of distortion allowed when comparing a pair of time series. With a training database of N time series of lengths L, a naive approach to learning the WW requires Omega(N^2 . L^3) operations. This often translates in NN-DTW requiring days for training on datasets containing a few thousand time series only. In this paper, we introduce FastWWSearch: an efficient and exact method to learn WW. We show on 86 datasets that our method is always faster than the state of the art, with at least one order of magnitude and up to 1000x speed-up.

Tan, C. W., Webb, G. I., Petitjean, F., & Reichl, P.
Proceedings of the First International Conference on Rail Transportation (ICRT-17), 2018.
[Bibtex] [Abstract]  → Access on publisher site

@InProceedings{TanEtAl18b,
author = {Tan, Chang Wei and Webb, Geoffrey I. and Petitjean, Francois and Reichl, Paul},
booktitle = {Proceedings of the First International Conference on Rail Transportation (ICRT-17)},
title = {Tamping Effectiveness Prediction Using Supervised Machine Learning Techniques},
year = {2018},
publisher = {ASCE},
abstract = {Railway maintenance planning is critical in maintaining track assets. Tamping is a common railway maintenance procedure and is often used when geometrical issues are first identified. Tamping repacks ballast particles under sleepers to restore the correct geometrical position of ballasted tracks. However, historical data shows that tamping is not always effective in restoring track to a satisfactory condition. Furthermore, ineffective, or unnecessary tamping tends to reduce the lifetime of existing track. An intuitive way of preventing ineffective tamping is to predict the likely tamping effectiveness. This work aims to predict the likely tamping effectiveness ahead of time using supervised machine learning techniques. Supervised machine learning techniques predict an outcome using labelled training data. In this case, the training database consists of multivariate sensor data from instrumented revenue vehicles (IRVs). The data between the previous and current tamping dates are used. This forms a time series database labelled with the tamping effectiveness of each track location based on the responses recorded from the IRVs before and after tamping. The labelled time series database is then used to train a time series classifier for prediction. This work uses the state of the art time series classification algorithm, k-nearest neighbour (k-NN) extended to the case of multivariate time series. k-NN is a non-parametric algorithm that does not make assumptions on the underlying model of the training data. With a sufficiently large training database, non-parametric algorithms can outperform parametric algorithms. Using k-NN, the tamping effectiveness of a potential tamping location that is not in the training database, or locations in the next tamping cycle, is predicted using the expected tamping effectiveness from a location in the training database that is the most similar to the target.
This allows the algorithm to effectively identify locations where tamping is likely to be ineffective. This work achieves high accuracy in the prediction of tamping effectiveness even at 12 weeks before tamping. It is hoped that the methodology will help in assisting decision making for maintenance planning activities.},
doi = {10.1061/9780784481257.101},
}
ABSTRACT Railway maintenance planning is critical in maintaining track assets. Tamping is a common railway maintenance procedure and is often used when geometrical issues are first identified. Tamping repacks ballast particles under sleepers to restore the correct geometrical position of ballasted tracks. However, historical data shows that tamping is not always effective in restoring track to a satisfactory condition. Furthermore, ineffective, or unnecessary tamping tends to reduce the lifetime of existing track. An intuitive way of preventing ineffective tamping is to predict the likely tamping effectiveness. This work aims to predict the likely tamping effectiveness ahead of time using supervised machine learning techniques. Supervised machine learning techniques predict an outcome using labelled training data. In this case, the training database consists of multivariate sensor data from instrumented revenue vehicles (IRVs). The data between the previous and current tamping dates are used. This forms a time series database labelled with the tamping effectiveness of each track location based on the responses recorded from the IRVs before and after tamping. The labelled time series database is then used to train a time series classifier for prediction. This work uses the state of the art time series classification algorithm, k-nearest neighbour (k-NN) extended to the case of multivariate time series. k-NN is a non-parametric algorithm that does not make assumptions on the underlying model of the training data. With a sufficiently large training database, non-parametric algorithms can outperform parametric algorithms. Using k-NN, the tamping effectiveness of a potential tamping location that is not in the training database, or locations in the next tamping cycle, is predicted using the expected tamping effectiveness from a location in the training database that is the most similar to the target. 
This allows the algorithm to effectively identify locations where tamping is likely to be ineffective. This work achieves high accuracy in the prediction of tamping effectiveness even at 12 weeks before tamping. It is hoped that the methodology will help in assisting decision making for maintenance planning activities.

Zaidi, N. A., & Webb, G. I.
Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 705-713, 2017.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ZaidiWebb17,
Title = {A Fast Trust-Region {Newton} Method for Softmax Logistic Regression},
Author = {Zaidi, Nayyar A. and Webb, Geoffrey I.},
Booktitle = {Proceedings of the 2017 {SIAM} International Conference on Data Mining},
Year = {2017},
Organization = {SIAM},
Pages = {705--713},
Abstract = {With the emergence of big data, there has been a growing interest in optimization routines that lead to faster convergence of Logistic Regression (LR). Among many optimization methods such as Gradient Descent, Quasi-Newton, Conjugate Gradient, etc., the Trust-region based truncated Newton method (TRON) algorithm has been shown to converge the fastest. The TRON algorithm also forms an important component of the highly efficient and widely used liblinear package. It has been shown that the WANBIA-C trick of scaling with the log of the naive Bayes conditional probabilities can greatly accelerate the convergence of LR trained using (first-order) Gradient Descent and (approximate second-order) Quasi-Newton optimization. In this work we study the applicability of the WANBIA-C trick to TRON. We first devise a TRON algorithm optimizing the softmax objective function and then demonstrate that WANBIA-C style preconditioning can be beneficial for TRON, leading to an extremely fast (batch) LR algorithm. Second, we present a comparative analysis of one-vs-all LR and softmax LR in terms of the 0-1 Loss, Bias, Variance, RMSE, Log-Loss, Training and Classification time, and show that softmax LR leads to significantly better RMSE and Log-Loss. We evaluate our proposed approach on 51 benchmark datasets.},
Doi = {10.1137/1.9781611974973.79},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning}
}
ABSTRACT With the emergence of big data, there has been a growing interest in optimization routines that lead to faster convergence of Logistic Regression (LR). Among many optimization methods such as Gradient Descent, Quasi-Newton, Conjugate Gradient, etc., the Trust-region based truncated Newton method (TRON) algorithm has been shown to converge the fastest. The TRON algorithm also forms an important component of the highly efficient and widely used liblinear package. It has been shown that the WANBIA-C trick of scaling with the log of the naive Bayes conditional probabilities can greatly accelerate the convergence of LR trained using (first-order) Gradient Descent and (approximate second-order) Quasi-Newton optimization. In this work we study the applicability of the WANBIA-C trick to TRON. We first devise a TRON algorithm optimizing the softmax objective function and then demonstrate that WANBIA-C style preconditioning can be beneficial for TRON, leading to an extremely fast (batch) LR algorithm. Second, we present a comparative analysis of one-vs-all LR and softmax LR in terms of the 0-1 Loss, Bias, Variance, RMSE, Log-Loss, Training and Classification time, and show that softmax LR leads to significantly better RMSE and Log-Loss. We evaluate our proposed approach on 51 benchmark datasets.

Tan, C. W., Webb, G. I., & Petitjean, F.
Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 282-290, 2017.
[Bibtex]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{TanEtAl17a,
Title = {Indexing and classifying gigabytes of time series under time warping},
Author = {Tan, Chang Wei and Webb, Geoffrey I. and Petitjean, Francois},
Booktitle = {Proceedings of the 2017 {SIAM} International Conference on Data Mining},
Year = {2017},
Organization = {SIAM},
Pages = {282--290},
Doi = {10.1137/1.9781611974973.32},
Keywords = {time series},
Related = {scalable-time-series-classifiers}
}
ABSTRACT 

Song, J., Li, C., Zheng, C., Revote, J., Zhang, Z., & Webb, G. I.
Current Bioinformatics, 12(6), 480-489, 2017.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl16,
author = {Song, Jiangning and Li, Chen and Zheng, Cheng and Revote, Jerico and Zhang, Ziding and Webb, Geoffrey I.},
journal = {Current Bioinformatics},
title = {{MetalExplorer}, a Bioinformatics Tool for the Improved Prediction of Eight Types of Metal-binding Sites Using a {Random Forest} Algorithm with Two-step Feature Selection},
year = {2017},
issn = {1574-8936/2212-392X},
number = {6},
pages = {480--489},
volume = {12},
abstract = {Metalloproteins are highly involved in many biological processes,
including catalysis, recognition, transport, transcription, and signal
transduction. The metal ions they bind usually play enzymatic or structural
roles in mediating these diverse functional roles. Thus, the systematic
analysis and prediction of metal-binding sites using sequence and/or
structural information are crucial for understanding their
sequence-structure-function relationships. In this study, we propose
MetalExplorer (http://metalexplorer.erc.monash.edu.au/), a new machine
learning-based method for predicting eight different types of metal-binding
sites (Ca, Co, Cu, Fe, Ni, Mg, Mn, and Zn) in proteins. Our approach
combines heterogeneous sequence-, structure-, and residue contact
network-based features. The predictive performance of MetalExplorer was
tested by cross-validation and independent tests using non-redundant
datasets of known structures. This method applies a two-step feature
selection approach based on the maximum relevance minimum redundancy and
forward feature selection to identify the most informative features that
contribute to the prediction performance. With a precision of 60%,
MetalExplorer achieved high recall values, which ranged from 59% to 88% for
the eight metal ion types in fivefold cross-validation tests. Moreover, the
common and type-specific features in the optimal subsets of all metal ions
were characterized in terms of their contributions to the overall
performance. In terms of both benchmark and independent datasets at the 60%
precision control level, MetalExplorer compared favorably with an existing
metalloprotein prediction tool, SitePredict. Thus, MetalExplorer is expected
to be a powerful tool for the accurate prediction of potential metal-binding
sites and it should facilitate the functional analysis and rational design
of novel metalloproteins.},
doi = {10.2174/2468422806666160618091522},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT Metalloproteins are highly involved in many biological processes, including catalysis, recognition, transport, transcription, and signal transduction. The metal ions they bind usually play enzymatic or structural roles in mediating these diverse functional roles. Thus, the systematic analysis and prediction of metal-binding sites using sequence and/or structural information are crucial for understanding their sequence-structure-function relationships. In this study, we propose MetalExplorer (http://metalexplorer.erc.monash.edu.au/), a new machine learning-based method for predicting eight different types of metal-binding sites (Ca, Co, Cu, Fe, Ni, Mg, Mn, and Zn) in proteins. Our approach combines heterogeneous sequence-, structure-, and residue contact network-based features. The predictive performance of MetalExplorer was tested by cross-validation and independent tests using non-redundant datasets of known structures. This method applies a two-step feature selection approach based on the maximum relevance minimum redundancy and forward feature selection to identify the most informative features that contribute to the prediction performance. With a precision of 60%, MetalExplorer achieved high recall values, which ranged from 59% to 88% for the eight metal ion types in fivefold cross-validation tests. Moreover, the common and type-specific features in the optimal subsets of all metal ions were characterized in terms of their contributions to the overall performance. In terms of both benchmark and independent datasets at the 60% precision control level, MetalExplorer compared favorably with an existing metalloprotein prediction tool, SitePredict. Thus, MetalExplorer is expected to be a powerful tool for the accurate prediction of potential metal-binding sites and it should facilitate the functional analysis and rational design of novel metalloproteins.

Wang, Y., Song, J., Marquez-Lago, T. T., Leier, A., Li, C., Lithgow, T., Webb, G. I., & Shen, H.
Scientific Reports, 7, Art. no. 5755, 2017.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{WangYEtAl17,
  author        = {Wang, Yanan and Song, Jiangning and Marquez-Lago, Tatiana T. and Leier, Andre and Li, Chen and Lithgow, Trevor and Webb, Geoffrey I. and Shen, Hong-Bin},
  journal       = {Scientific Reports},
  title         = {Knowledge-transfer learning for prediction of matrix metalloprotease substrate-cleavage sites},
  year          = {2017},
  volume        = {7},
  articlenumber = {5755},
  doi           = {10.1038/s41598-017-06219-7},
  keywords      = {Bioinformatics and DP140100087},
  related       = {computational-biology},
}
ABSTRACT 

Song, J., Li, F., Leier, A., Marquez-Lago, T. T., Akutsu, T., Haffari, G., Chou, K., Webb, G. I., & Pike, R. N.
Bioinformatics, 34(4), 684-687, 2017.
Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{Song2017a,
author = {Song, Jiangning and Li, Fuyi and Leier, Andre and Marquez-Lago, Tatiana T and Akutsu, Tatsuya and Haffari, Gholamreza and Chou, Kuo-Chen and Webb, Geoffrey I and Pike, Robert N},
journal = {Bioinformatics},
title = {{PROSPERous}: high-throughput prediction of substrate cleavage sites for 90 proteases with improved accuracy},
year = {2017},
number = {4},
pages = {684--687},
volume = {34},
comment = {Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021},
doi = {10.1093/bioinformatics/btx670},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT 

Wang, J., Yang, B., Revote, J., Leier, A., Marquez-Lago, T. T., Webb, G. I., Song, J., Chou, K., & Lithgow, T.
Bioinformatics, 33(17), 2756-2758, 2017.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{WangJEtAl17,
author = {Wang, Jiawei and Yang, Bingjiao and Revote, Jerico and Leier, Andre and Marquez-Lago, Tatiana T. and Webb, Geoffrey I. and Song, Jiangning and Chou, Kuo-Chen and Lithgow, Trevor},
journal = {Bioinformatics},
title = {{POSSUM}: a bioinformatics toolkit for generating numerical sequence feature descriptors based on {PSSM} profiles},
year = {2017},
number = {17},
pages = {2756--2758},
volume = {33},
doi = {10.1093/bioinformatics/btx302},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT 

Song, J., Wang, H., Wang, J., Leier, A., Marquez-Lago, T., Yang, B., Zhang, Z., Akutsu, T., Webb, G. I., & Daly, R. J.
Scientific Reports, 7(1), Art. no. 6862, 2017.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Song2017,
author = {Song, Jiangning and Wang, Huilin and Wang, Jiawei and Leier, Andre and Marquez-Lago, Tatiana and Yang, Bingjiao and Zhang, Ziding and Akutsu, Tatsuya and Webb, Geoffrey I. and Daly, Roger J.},
journal = {Scientific Reports},
title = {{PhosphoPredict}: A bioinformatics tool for prediction of human kinase-specific phosphorylation substrates and sites by integrating heterogeneous feature selection},
year = {2017},
number = {1},
volume = {7},
abstract = {Protein phosphorylation is a major form of post-translational modification (PTM) that regulates diverse cellular processes. In silico methods for phosphorylation site prediction can provide a useful and complementary strategy for complete phosphoproteome annotation. Here, we present a novel bioinformatics tool, PhosphoPredict, that combines protein sequence and functional features to predict kinase-specific substrates and their associated phosphorylation sites for 12 human kinases and kinase families, including ATM, CDKs, GSK-3, MAPKs, PKA, PKB, PKC, and SRC. To elucidate critical determinants, we identified feature subsets that were most informative and relevant for predicting substrate specificity for each individual kinase family. Extensive benchmarking experiments based on both five-fold cross-validation and independent tests indicated that the performance of PhosphoPredict is competitive with that of several other popular prediction tools, including KinasePhos, PPSP, GPS, and Musite. We found that combining protein functional and sequence features significantly improves phosphorylation site prediction performance across all kinases. Application of PhosphoPredict to the entire human proteome identified 150 to 800 potential phosphorylation substrates for each of the 12 kinases or kinase families. PhosphoPredict significantly extends the bioinformatics portfolio for kinase function analysis and will facilitate high-throughput identification of kinase-specific phosphorylation sites, thereby contributing to both basic and translational research programs.},
articlenumber = {6862},
doi = {10.1038/s41598-017-07199-4},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Protein phosphorylation is a major form of post-translational modification (PTM) that regulates diverse cellular processes. In silico methods for phosphorylation site prediction can provide a useful and complementary strategy for complete phosphoproteome annotation. Here, we present a novel bioinformatics tool, PhosphoPredict, that combines protein sequence and functional features to predict kinase-specific substrates and their associated phosphorylation sites for 12 human kinases and kinase families, including ATM, CDKs, GSK-3, MAPKs, PKA, PKB, PKC, and SRC. To elucidate critical determinants, we identified feature subsets that were most informative and relevant for predicting substrate specificity for each individual kinase family. Extensive benchmarking experiments based on both five-fold cross-validation and independent tests indicated that the performance of PhosphoPredict is competitive with that of several other popular prediction tools, including KinasePhos, PPSP, GPS, and Musite. We found that combining protein functional and sequence features significantly improves phosphorylation site prediction performance across all kinases. Application of PhosphoPredict to the entire human proteome identified 150 to 800 potential phosphorylation substrates for each of the 12 kinases or kinase families. PhosphoPredict significantly extends the bioinformatics portfolio for kinase function analysis and will facilitate high-throughput identification of kinase-specific phosphorylation sites, thereby contributing to both basic and translational research programs.

Hamalainen, W., & Webb, G. I.
Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 309-317, 2017.

@InProceedings{HamalainenWebb17,
author = {Hamalainen, Wilhelmiina and Webb, Geoffrey I.},
booktitle = {Proceedings of the 2017 {SIAM} International Conference on Data Mining},
title = {Specious rules: an efficient and effective unifying method for removing misleading and uninformative patterns in association rule mining},
year = {2017},
organization = {SIAM},
pages = {309--317},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
related = {statistically-sound-association-discovery},
}
ABSTRACT 

Zaidi, N., Webb, G. I., Carman, M., Petitjean, F., Buntine, W., Hynes, H., & De Sterck, H.
Machine Learning, 106(9-10), 1289-1329, 2017.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{ZaidiEtAl17,
author = {Zaidi, N. and Webb, Geoffrey I. and Carman, M. and Petitjean, F. and Buntine, W. and Hynes, H. and De Sterck, H.},
journal = {Machine Learning},
title = {Efficient Parameter Learning of {Bayesian} Network Classifiers},
year = {2017},
number = {9-10},
pages = {1289--1329},
volume = {106},
abstract = {Recent advances have demonstrated substantial benefits from learning with both generative and discriminative parameters. On the one hand, generative approaches address the estimation of the parameters of the joint distribution---P(y,x), which for most network types is very computationally efficient (a notable exception to this are Markov networks) and on the other hand, discriminative approaches address the estimation of the parameters of the posterior distribution---and, are more effective for classification, since they fit P(y|x) directly. However, discriminative approaches are less computationally efficient as the normalization factor in the conditional log-likelihood precludes the derivation of closed-form estimation of parameters. This paper introduces a new discriminative parameter learning method for Bayesian network classifiers that combines in an elegant fashion parameters learned using both generative and discriminative methods. The proposed method is discriminative in nature, but uses estimates of generative probabilities to speed-up the optimization process. A second contribution is to propose a simple framework to characterize the parameter learning task for Bayesian network classifiers. We conduct an extensive set of experiments on 72 standard datasets and demonstrate that our proposed discriminative parameterization provides an efficient alternative to other state-of-the-art parameterizations.},
doi = {10.1007/s10994-016-5619-z},
keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
related = {combining-generative-and-discriminative-learning},
url = {http://rdcu.be/oP1t},
}
ABSTRACT Recent advances have demonstrated substantial benefits from learning with both generative and discriminative parameters. On the one hand, generative approaches address the estimation of the parameters of the joint distribution—P(y,x), which for most network types is very computationally efficient (a notable exception to this are Markov networks) and on the other hand, discriminative approaches address the estimation of the parameters of the posterior distribution—and, are more effective for classification, since they fit P(y|x) directly. However, discriminative approaches are less computationally efficient as the normalization factor in the conditional log-likelihood precludes the derivation of closed-form estimation of parameters. This paper introduces a new discriminative parameter learning method for Bayesian network classifiers that combines in an elegant fashion parameters learned using both generative and discriminative methods. The proposed method is discriminative in nature, but uses estimates of generative probabilities to speed-up the optimization process. A second contribution is to propose a simple framework to characterize the parameter learning task for Bayesian network classifiers. We conduct an extensive set of experiments on 72 standard datasets and demonstrate that our proposed discriminative parameterization provides an efficient alternative to other state-of-the-art parameterizations.

Forestier, G., Petitjean, F., Dau, H. A., Webb, G. I., & Keogh, E.
IEEE International Conference on Data Mining (ICDM-17), pp. 865-870, 2017.

@InProceedings{ForestierEtAl17,
author = {Forestier, Germain and Petitjean, Francois and Dau, Hoang Anh and Webb, Geoffrey I. and Keogh, Eamonn},
booktitle = {{IEEE} International Conference on Data Mining ({ICDM}-17)},
title = {Generating synthetic time series to augment sparse datasets},
year = {2017},
pages = {865--870},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT 

Fernando, T. L., & Webb, G. I.
Data Mining and Knowledge Discovery, 31(1), 264-286, 2017.

@Article{FernandoWebb16,
author = {Fernando, Thilak L. and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {{SimUSF}: an efficient and effective similarity measure that is invariant to violations of the interval scale assumption},
year = {2017},
number = {1},
pages = {264--286},
volume = {31},
abstract = {Similarity measures are central to many machine learning algorithms. There are many different similarity measures, each catering for different applications and data requirements. Most similarity measures used with numerical data assume that the attributes are interval scale. In the interval scale, it is assumed that a unit difference has the same meaning irrespective of the magnitudes of the values separated. When this assumption is violated, accuracy may be reduced. Our experiments show that removing the interval scale assumption by transforming data to ranks can improve the accuracy of distance-based similarity measures on some tasks. However the rank transform has high time and storage overheads. In this paper, we introduce an efficient similarity measure which does not consider the magnitudes of inter-instance distances. We compare the new similarity measure with popular similarity measures in two applications: DBScan clustering and content based multimedia information retrieval with real world datasets and different transform functions. The results show that the proposed similarity measure provides good performance on a range of tasks and is invariant to violations of the interval scale assumption.},
doi = {10.1007/s10618-016-0463-0},
}
ABSTRACT Similarity measures are central to many machine learning algorithms. There are many different similarity measures, each catering for different applications and data requirements. Most similarity measures used with numerical data assume that the attributes are interval scale. In the interval scale, it is assumed that a unit difference has the same meaning irrespective of the magnitudes of the values separated. When this assumption is violated, accuracy may be reduced. Our experiments show that removing the interval scale assumption by transforming data to ranks can improve the accuracy of distance-based similarity measures on some tasks. However the rank transform has high time and storage overheads. In this paper, we introduce an efficient similarity measure which does not consider the magnitudes of inter-instance distances. We compare the new similarity measure with popular similarity measures in two applications: DBScan clustering and content based multimedia information retrieval with real world datasets and different transform functions. The results show that the proposed similarity measure provides good performance on a range of tasks and is invariant to violations of the interval scale assumption.

Encyclopedia of Machine Learning and Data Mining
Sammut, C., & Webb, G. I. (Eds.).
Berlin: Springer, 2017.
[Bibtex]  → Access on publisher site

@Book{SammutWebb17,
editor = {Sammut, C. and Webb, G. I.},
publisher = {Springer},
title = {Encyclopedia of Machine Learning and Data Mining},
year = {2017},
doi = {10.1007/978-1-4899-7502-7},
}
ABSTRACT 

An, Y., Wang, J., Li, C., Revote, J., Zhang, Y., Naderer, T., Hayashida, M., Akutsu, T., Webb, G. I., Lithgow, T., & Song, J.
Scientific Reports, 7, Art. no. 41031, 2017.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{AnEtAl17,
author = {An, Yi and Wang, Jiawei and Li, Chen and Revote, Jerico and Zhang, Yang and Naderer, Thomas and Hayashida, Morihiro and Akutsu, Tatsuya and Webb, Geoffrey I. and Lithgow, Trevor and Song, Jiangning},
journal = {Scientific Reports},
title = {{SecretEPDB}: a comprehensive web-based resource for secreted effector proteins of the bacterial types {III}, {IV} and {VI} secretion systems},
year = {2017},
volume = {7},
articlenumber = {41031},
doi = {10.1038/srep41031},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
url = {http://rdcu.be/oJ9I},
}
ABSTRACT 

Ananda-Rajah, M. R., Bergmeir, C., Petitjean, F., Slavin, M. A., Thursky, K. A., & Webb, G. I.
JCO Clinical Cancer Informatics, 1, 1-10, 2017.
[Bibtex] [Abstract]  → Access on publisher site

@Article{Ananda-RajahEtAl17,
author = {Ananda-Rajah, Michelle R. and Bergmeir, Christoph and Petitjean, Francois and Slavin, Monica A. and Thursky, Karin A. and Webb, Geoffrey I.},
journal = {JCO Clinical Cancer Informatics},
title = {Toward Electronic Surveillance of Invasive Mold Diseases in Hematology-Oncology Patients: An Expert System Combining Natural Language Processing of Chest Computed Tomography Reports, Microbiology, and Antifungal Drug Data},
year = {2017},
pages = {1--10},
volume = {1},
abstract = {Prospective epidemiologic surveillance of invasive mold disease (IMD) in hematology patients is hampered by the absence of a reliable laboratory prompt. This study develops an expert system for electronic surveillance of IMD that combines probabilities using natural language processing (NLP) of computed tomography (CT) reports with microbiology and antifungal drug data to improve prediction of IMD. Methods Microbiology indicators and antifungal drug dispensing data were extracted from hospital information systems at three tertiary hospitals for 123 hematology-oncology patients. Of this group, 64 case patients had 26 probable/proven IMD according to international definitions, and 59 patients were uninfected controls. Derived probabilities from NLP combined with medical expertise identified patients at high likelihood of IMD, with remaining patients processed by a machine-learning classifier trained on all available features. Results Compared with the baseline text classifier, the expert system that incorporated the best performing algorithm (naive Bayes) improved specificity from 50.8\% (95\% CI, 37.5\% to 64.1\%) to 74.6\% (95\% CI, 61.6\% to 85.0\%), reducing false positives by 48\% from 29 to 15; improved sensitivity slightly from 96.9\% (95\% CI, 89.2\% to 99.6\%) to 98.4\% (95\% CI, 91.6\% to 100\%); and improved receiver operating characteristic area from 73.9\% (95\% CI, 67.1\% to 80.6\%) to 92.8\% (95\% CI, 88\% to 97.5\%). Conclusion An expert system that uses multiple sources of data (CT reports, microbiology, antifungal drug dispensing) is a promising approach to continuous prospective surveillance of IMD in the hospital, and demonstrates reduced false notifications (positives) compared with NLP of CT reports alone. Our expert system could provide decision support for IMD surveillance, which is critical to antifungal stewardship and improving supportive care in cancer.},
doi = {10.1200/CCI.17.00011},
owner = {giwebb},
timestamp = {2017.09.07},
url = {https://doi.org/10.1200/CCI.17.00011},
}
ABSTRACT Prospective epidemiologic surveillance of invasive mold disease (IMD) in hematology patients is hampered by the absence of a reliable laboratory prompt. This study develops an expert system for electronic surveillance of IMD that combines probabilities using natural language processing (NLP) of computed tomography (CT) reports with microbiology and antifungal drug data to improve prediction of IMD. Methods Microbiology indicators and antifungal drug dispensing data were extracted from hospital information systems at three tertiary hospitals for 123 hematology-oncology patients. Of this group, 64 case patients had 26 probable/proven IMD according to international definitions, and 59 patients were uninfected controls. Derived probabilities from NLP combined with medical expertise identified patients at high likelihood of IMD, with remaining patients processed by a machine-learning classifier trained on all available features. Results Compared with the baseline text classifier, the expert system that incorporated the best performing algorithm (naive Bayes) improved specificity from 50.8% (95% CI, 37.5% to 64.1%) to 74.6% (95% CI, 61.6% to 85.0%), reducing false positives by 48% from 29 to 15; improved sensitivity slightly from 96.9% (95% CI, 89.2% to 99.6%) to 98.4% (95% CI, 91.6% to 100%); and improved receiver operating characteristic area from 73.9% (95% CI, 67.1% to 80.6%) to 92.8% (95% CI, 88% to 97.5%). Conclusion An expert system that uses multiple sources of data (CT reports, microbiology, antifungal drug dispensing) is a promising approach to continuous prospective surveillance of IMD in the hospital, and demonstrates reduced false notifications (positives) compared with NLP of CT reports alone. Our expert system could provide decision support for IMD surveillance, which is critical to antifungal stewardship and improving supportive care in cancer.

Chen, S., Martinez, A. M., Webb, G. I., & Wang, L.
Knowledge and Information Systems, 50(2), 475-503, 2017.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{ChenEtAl16,
author = {Chen, Shenglei and Martinez, Ana M. and Webb, Geoffrey I. and Wang, Limin},
journal = {Knowledge and Information Systems},
title = {Selective {AnDE} for large data learning: a low-bias memory constrained approach},
year = {2017},
issn = {0219-3116},
number = {2},
pages = {475--503},
volume = {50},
abstract = {Learning from data that are too big to fit into memory poses great challenges to currently available learning approaches. Averaged n-Dependence Estimators (AnDE) allows for a flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence, AnDE is especially appropriate for learning from large quantities of data. Memory requirement in AnDE, however, increases combinatorially with the number of attributes and the parameter n. In large data learning, number of attributes is often large and we also expect high n to achieve low-bias classification. In order to achieve the lower bias of AnDE with higher n but with less memory requirement, we propose a memory constrained selective AnDE algorithm, in which two passes of learning through training examples are involved. The first pass performs attribute selection on super parents according to available memory, whereas the second one learns an AnDE model with parents only on the selected attributes. Extensive experiments show that the new selective AnDE has considerably lower bias and prediction error relative to A{$n'$}DE, where {$n' = n-1$}, while maintaining the same space complexity and similar time complexity. The proposed algorithm works well on categorical data. Numerical data sets need to be discretized first.},
doi = {10.1007/s10115-016-0937-9},
keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Learning from data that are too big to fit into memory poses great challenges to currently available learning approaches. Averaged n-Dependence Estimators (AnDE) allows for a flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence, AnDE is especially appropriate for learning from large quantities of data. Memory requirement in AnDE, however, increases combinatorially with the number of attributes and the parameter n. In large data learning, number of attributes is often large and we also expect high n to achieve low-bias classification. In order to achieve the lower bias of AnDE with higher n but with less memory requirement, we propose a memory constrained selective AnDE algorithm, in which two passes of learning through training examples are involved. The first pass performs attribute selection on super parents according to available memory, whereas the second one learns an AnDE model with parents only on the selected attributes. Extensive experiments show that the new selective AnDE has considerably lower bias and prediction error relative to An′DE, where n′ = n − 1, while maintaining the same space complexity and similar time complexity. The proposed algorithm works well on categorical data. Numerical data sets need to be discretized first.

Chen, S., Martinez, A., Webb, G., & Wang, L.
IEEE Transactions on Knowledge and Data Engineering, 29(1), 172-185, 2017.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{ChenEtAl16b,
author = {Chen, Shenglei and Martinez, Ana M. and Webb, Geoffrey I. and Wang, Limin},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
title = {Sample-based Attribute Selective {AnDE} for Large Data},
year = {2017},
issn = {1041-4347},
number = {1},
pages = {172--185},
volume = {29},
abstract = {More and more applications come with large data sets in the past decade. However, existing algorithms cannot guarantee to scale well on large data. Averaged n-Dependence Estimators (AnDE) allows for flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence AnDE is especially appropriate for large data learning. In this paper, we propose a sample-based attribute selection technique for AnDE. It needs one more pass through the training data, in which a multitude of approximate AnDE models are built and efficiently assessed by leave-one-out cross validation. The use of a sample reduces the training time. Experiments on 15 large data sets demonstrate that the proposed technique significantly reduces AnDE's error at the cost of a modest increase in training time. This efficient and scalable out-of-core approach delivers superior or comparable performance to typical in-core Bayesian network classifiers.},
doi = {10.1109/TKDE.2016.2608881},
keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT More and more applications come with large data sets in the past decade. However, existing algorithms cannot guarantee to scale well on large data. Averaged n-Dependence Estimators (AnDE) allows for flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence AnDE is especially appropriate for large data learning. In this paper, we propose a sample-based attribute selection technique for AnDE. It needs one more pass through the training data, in which a multitude of approximate AnDE models are built and efficiently assessed by leave-one-out cross validation. The use of a sample reduces the training time. Experiments on 15 large data sets demonstrate that the proposed technique significantly reduces AnDE’s error at the cost of a modest increase in training time. This efficient and scalable out-of-core approach delivers superior or comparable performance to typical in-core Bayesian network classifiers.

Bergmeir, C., Bilgrami, I., Bain, C., Webb, G. I., Orosz, J., & Pilcher, D.
PLoS ONE, 12(12), Art. no. e0188688, 2017.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{BergmeirEtAl2017,
author = {Bergmeir, Christoph and Bilgrami, Irma and Bain, Christopher and Webb, Geoffrey I. and Orosz, Judit and Pilcher, David},
journal = {PLoS ONE},
title = {Designing a more efficient, effective and safe {Medical Emergency Team} ({MET}) service using data analysis},
year = {2017},
number = {12},
volume = {12},
articlenumber = {e0188688},
doi = {10.1371/journal.pone.0188688},
keywords = {health},
related = {health},
}
ABSTRACT 

Petitjean, F., Li, T., Tatti, N., & Webb, G. I.
Data Mining and Knowledge Discovery, 30(5), 1086-1111, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{PetitjeanEtAl16b,
author = {Petitjean, Francois and Li, Tao and Tatti, Nikolaj and Webb, Geoffrey I.},
journal = {Data Mining and Knowledge Discovery},
title = {{Skopus}: Mining top-k sequential patterns under leverage},
year = {2016},
issn = {1573-756X},
number = {5},
pages = {1086--1111},
volume = {30},
abstract = {This paper presents a framework for exact discovery of the top-k sequential patterns under Leverage. It combines (1) a novel definition of the expected support for a sequential pattern---a concept on which most interestingness measures directly rely---with (2) Skopus: a new branch-and-bound algorithm for the exact discovery of top-k sequential patterns under a given measure of interest. Our interestingness measure employs the partition approach. A pattern is interesting to the extent that it is more frequent than can be explained by assuming independence between any of the pairs of patterns from which it can be composed. The larger the support compared to the expectation under independence, the more interesting is the pattern. We build on these two elements to exactly extract the k sequential patterns with highest leverage, consistent with our definition of expected support. We conduct experiments on both synthetic data with known patterns and real-world datasets; both experiments confirm the consistency and relevance of our approach with regard to the state of the art.},
doi = {10.1007/s10618-016-0467-9},
keywords = {OPUS and Association Rule Discovery and statistically sound discovery},
related = {statistically-sound-association-discovery},
url = {http://rdcu.be/tsDo},
}
ABSTRACT This paper presents a framework for exact discovery of the top-k sequential patterns under Leverage. It combines (1) a novel definition of the expected support for a sequential pattern—a concept on which most interestingness measures directly rely—with (2) Skopus: a new branch-and-bound algorithm for the exact discovery of top-k sequential patterns under a given measure of interest. Our interestingness measure employs the partition approach. A pattern is interesting to the extent that it is more frequent than can be explained by assuming independence between any of the pairs of patterns from which it can be composed. The larger the support compared to the expectation under independence, the more interesting is the pattern. We build on these two elements to exactly extract the k sequential patterns with highest leverage, consistent with our definition of expected support. We conduct experiments on both synthetic data with known patterns and real-world datasets; both experiments confirm the consistency and relevance of our approach with regard to the state of the art.

Li, F., Li, C., Revote, J., Zhang, Y., Webb, G. I., Li, J., Song, J., & Lithgow, T.
Scientific Reports, 6, Art. no. 34595, 2016.
[Bibtex]  → Access on publisher site  → Related papers and software

@Article{LiEtAl16,
author = {Li, Fuyi and Li, Chen and Revote, Jerico and Zhang, Yang and Webb, Geoffrey I. and Li, Jian and Song, Jiangning and Lithgow, Trevor},
journal = {Scientific Reports},
title = {{GlycoMinestruct}: a new bioinformatics tool for highly accurate mapping of the human {N-linked} and {O-linked} glycoproteomes by incorporating structural features},
year = {2016},
month = oct,
volume = {6},
articlenumber = {34595},
doi = {10.1038/srep34595},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT 

Martinez, A. M., Webb, G. I., Chen, S., & Zaidi, N. A.
Journal of Machine Learning Research, 17(44), 1-35, 2016.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{MartinezEtAl16,
author = {Martinez, Ana M. and Webb, Geoffrey I. and Chen, Shenglei and Zaidi, Nayyar A.},
journal = {Journal of Machine Learning Research},
title = {Scalable Learning of {Bayesian} Network Classifiers},
year = {2016},
number = {44},
pages = {1--35},
volume = {17},
abstract = {Ever increasing data quantity makes ever more urgent the need for highly scalable learners that have good classification performance. Therefore, an out-of-core learner with excellent time and space complexity, along with high expressivity (that is, capacity to learn very complex multivariate probability distributions) is extremely desirable. This paper presents such a learner. We propose an extension to the k-dependence Bayesian classifier (KDB) that discriminatively selects a sub-model of a full KDB classifier. It requires only one additional pass through the training data, making it a three-pass learner. Our extensive experimental evaluation on 16 large data sets reveals that this out-of-core algorithm achieves competitive classification performance, and substantially better training and classification time than state-of-the-art in-core learners such as random forest and linear and non-linear logistic regression.},
keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
related = {learning-complex-conditional-probabilities-from-data},
url = {http://jmlr.org/papers/v17/martinez16a.html},
}
ABSTRACT Ever increasing data quantity makes ever more urgent the need for highly scalable learners that have good classification performance. Therefore, an out-of-core learner with excellent time and space complexity, along with high expressivity (that is, capacity to learn very complex multivariate probability distributions) is extremely desirable. This paper presents such a learner. We propose an extension to the k-dependence Bayesian classifier (KDB) that discriminatively selects a sub-model of a full KDB classifier. It requires only one additional pass through the training data, making it a three-pass learner. Our extensive experimental evaluation on 16 large data sets reveals that this out-of-core algorithm achieves competitive classification performance, and substantially better training and classification time than state-of-the-art in-core learners such as random forest and linear and non-linear logistic regression.

Chang, C. C. H., Li, C., Webb, G. I., Tey, B., & Song, J.
Scientific Reports, 6, Art. no. 21844, 2016.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ChangEtAl2016,
Title = {Periscope: quantitative prediction of soluble protein expression in the periplasm of {Escherichia} coli},
Author = {Chang, C. C. H. and Li, C. and Webb, G. I. and Tey, B. and Song, J.},
Journal = {Scientific Reports},
Year = {2016},
Volume = {6},
Abstract = {Periplasmic expression of soluble proteins in Escherichia coli not only offers a much-simplified downstream purification process, but also enhances the probability of obtaining correctly folded and biologically active proteins. Different combinations of signal peptides and target proteins lead to different soluble protein expression levels, ranging from negligible to several grams per litre. Accurate algorithms for rational selection of promising candidates can serve as a powerful tool to complement with current trial-and-error approaches. Accordingly, proteomics studies can be conducted with greater efficiency and cost-effectiveness. Here, we developed a predictor with a two-stage architecture, to predict the real-valued expression level of target protein in the periplasm. The output of the first-stage support vector machine (SVM) classifier determines which second-stage support vector regression (SVR) classifier to be used. When tested on an independent test dataset, the predictor achieved an overall prediction accuracy of 78% and a Pearson’s correlation coefficient (PCC) of 0.77. We further illustrate the relative importance of various features with respect to different models. The results indicate that the occurrence of dipeptide glutamine and aspartic acid is the most important feature for the classification model. Finally, we provide access to the implemented predictor through the Periscope webserver, freely accessible at http://lightning.med.monash.edu/periscope/.},
Articlenumber = {21844},
Doi = {10.1038/srep21844},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT Periplasmic expression of soluble proteins in Escherichia coli not only offers a much-simplified downstream purification process, but also enhances the probability of obtaining correctly folded and biologically active proteins. Different combinations of signal peptides and target proteins lead to different soluble protein expression levels, ranging from negligible to several grams per litre. Accurate algorithms for rational selection of promising candidates can serve as a powerful tool to complement with current trial-and-error approaches. Accordingly, proteomics studies can be conducted with greater efficiency and cost-effectiveness. Here, we developed a predictor with a two-stage architecture, to predict the real-valued expression level of target protein in the periplasm. The output of the first-stage support vector machine (SVM) classifier determines which second-stage support vector regression (SVR) classifier to be used. When tested on an independent test dataset, the predictor achieved an overall prediction accuracy of 78% and a Pearson’s correlation coefficient (PCC) of 0.77. We further illustrate the relative importance of various features with respect to different models. The results indicate that the occurrence of dipeptide glutamine and aspartic acid is the most important feature for the classification model. Finally, we provide access to the implemented predictor through the Periscope webserver, freely accessible at http://lightning.med.monash.edu/periscope/.

Petitjean, F., Forestier, G., Webb, G. I., Nicholson, A. E., Chen, Y., & Keogh, E.
Knowledge and Information Systems, 47(1), 1-26, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{PetitjeanEtAl16a,
author = {Petitjean, F. and Forestier, G. and Webb, G. I. and Nicholson, A. E. and Chen, Y. and Keogh, E.},
journal = {Knowledge and Information Systems},
title = {Faster and more accurate classification of time series by exploiting a novel dynamic time warping averaging algorithm},
year = {2016},
number = {1},
pages = {1--26},
volume = {47},
abstract = {A concerted research effort over the past two decades has heralded significant improvements in both the efficiency and effectiveness of time series classification. The consensus that has emerged in the community is that the best solution is a surprisingly simple one. In virtually all domains, the most accurate classifier is the nearest neighbor algorithm with dynamic time warping as the distance measure. The time complexity of dynamic time warping means that successful deployments on resource-constrained devices remain elusive. Moreover, the recent explosion of interest in wearable computing devices, which typically have limited computational resources, has greatly increased the need for very efficient classification algorithms. A classic technique to obtain the benefits of the nearest neighbor algorithm, without inheriting its undesirable time and space complexity, is to use the nearest centroid algorithm. Unfortunately, the unique properties of (most) time series data mean that the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this paper we demonstrate that we can exploit a recent result by Petitjean et al. to allow meaningful averaging of “warped” time series, which then allows us to create super-efficient nearest “centroid” classifiers that are at least as accurate as their more computationally challenged nearest neighbor relatives. We demonstrate empirically the utility of our approach by comparing it to all the appropriate strawmen algorithms on the ubiquitous UCR Benchmarks and with a case study in supporting insect classification on resource-constrained sensors.},
doi = {10.1007/s10115-015-0878-8},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT A concerted research effort over the past two decades has heralded significant improvements in both the efficiency and effectiveness of time series classification. The consensus that has emerged in the community is that the best solution is a surprisingly simple one. In virtually all domains, the most accurate classifier is the nearest neighbor algorithm with dynamic time warping as the distance measure. The time complexity of dynamic time warping means that successful deployments on resource-constrained devices remain elusive. Moreover, the recent explosion of interest in wearable computing devices, which typically have limited computational resources, has greatly increased the need for very efficient classification algorithms. A classic technique to obtain the benefits of the nearest neighbor algorithm, without inheriting its undesirable time and space complexity, is to use the nearest centroid algorithm. Unfortunately, the unique properties of (most) time series data mean that the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this paper we demonstrate that we can exploit a recent result by Petitjean et al. to allow meaningful averaging of “warped” time series, which then allows us to create super-efficient nearest “centroid” classifiers that are at least as accurate as their more computationally challenged nearest neighbor relatives. We demonstrate empirically the utility of our approach by comparing it to all the appropriate strawmen algorithms on the ubiquitous UCR Benchmarks and with a case study in supporting insect classification on resource-constrained sensors.

Webb, G. I., Hyde, R., Cao, H., Nguyen, H. L., & Petitjean, F.
Data Mining and Knowledge Discovery, 30(4), 964-994, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{WebbEtAl16,
author = {Webb, G. I. and Hyde, R. and Cao, H. and Nguyen, H. L. and Petitjean, F.},
journal = {Data Mining and Knowledge Discovery},
title = {Characterizing Concept Drift},
year = {2016},
number = {4},
pages = {964--994},
volume = {30},
abstract = {Most machine learning models are static, but the world is dynamic, and increasing online deployment of learned models gives increasing urgency to the development of efficient and effective mechanisms to address learning in the context of non-stationary distributions, or as it is commonly called concept drift. However, the key issue of characterizing the different types of drift that can occur has not previously been subjected to rigorous definition and analysis. In particular, while some qualitative drift categorizations have been proposed, few have been formally defined, and the quantitative descriptions required for precise and objective understanding of learner performance have not existed. We present the first comprehensive framework for quantitative analysis of drift. This supports the development of the first comprehensive set of formal definitions of types of concept drift. The formal definitions clarify ambiguities and identify gaps in previous definitions, giving rise to a new comprehensive taxonomy of concept drift types and a solid foundation for research into mechanisms to detect and address concept drift.},
doi = {10.1007/s10618-015-0448-4},
keywords = {Concept Drift},
related = {learning-from-non-stationary-distributions},
url = {https://rdcu.be/7vMN},
}
ABSTRACT Most machine learning models are static, but the world is dynamic, and increasing online deployment of learned models gives increasing urgency to the development of efficient and effective mechanisms to address learning in the context of non-stationary distributions, or as it is commonly called concept drift. However, the key issue of characterizing the different types of drift that can occur has not previously been subjected to rigorous definition and analysis. In particular, while some qualitative drift categorizations have been proposed, few have been formally defined, and the quantitative descriptions required for precise and objective understanding of learner performance have not existed. We present the first comprehensive framework for quantitative analysis of drift. This supports the development of the first comprehensive set of formal definitions of types of concept drift. The formal definitions clarify ambiguities and identify gaps in previous definitions, giving rise to a new comprehensive taxonomy of concept drift types and a solid foundation for research into mechanisms to detect and address concept drift.

Petitjean, F., & Webb, G. I.
Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16, pp. 2131-2132, 2016.
[Bibtex]  → Access on publisher site  → Related papers and software

@InProceedings{PetitjeanWebbTut16,
author = {Petitjean, F. and Webb, G. I.},
booktitle = {Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16},
title = {Scalable Learning of Graphical Models},
year = {2016},
pages = {2131--2132},
publisher = {ACM Press},
keywords = {scalable graphical models and Learning from large datasets and DP140100087},
related = {scalable-graphical-modeling},
url = {http://dl.acm.org/authorize?N19101},
}
ABSTRACT 

Webb, G. I., & Petitjean, F.
Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16, pp. 1255-1264, 2016.
Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{WebbPetitjean16,
Title = {A multiple test correction for streams and cascades of statistical hypothesis tests},
Author = {Webb, Geoffrey I. and Petitjean, Francois},
Booktitle = {Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16},
Year = {2016},
Pages = {1255--1264},
Publisher = {ACM Press},
Abstract = {Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e.~rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance.
This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed.
To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models.
We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.},
Comment = {Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue},
Doi = {10.1145/2939672.2939775},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
Related = {statistically-sound-association-discovery},
Url = {http://dl.acm.org/authorize?N19100}
}
ABSTRACT Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e. rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance. This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed. To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models. We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.

Porebski, B. T., Keleher, S., Hollins, J. J., Nickson, A. A., Marijanovic, E. M., Borg, N. A., Costa, M. G. S., Pearce, M. A., Dai, W., Zhu, L., Irving, J. A., Hoke, D. E., Kass, I., Whisstock, J. C., Bottomley, S. P., Webb, G. I., McGowan, S., & Buckle, A. M.
Scientific Reports, 6, Art. no. 33958, 2016.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Porebski2016,
author = {Porebski, Benjamin T. and Keleher, Shani and Hollins, Jeffrey J. and Nickson, Adrian A. and Marijanovic, Emilia M. and Borg, Natalie A. and Costa, Mauricio G. S. and Pearce, Mary A. and Dai, Weiwen and Zhu, Liguang and Irving, James A. and Hoke, David E. and Kass, Itamar and Whisstock, James C. and Bottomley, Stephen P. and Webb, Geoffrey I. and McGowan, Sheena and Buckle, Ashley M.},
journal = {Scientific Reports},
title = {Smoothing a rugged protein folding landscape by sequence-based redesign},
year = {2016},
volume = {6},
abstract = {The rugged folding landscapes of functional proteins puts them at risk of misfolding and aggregation. Serine protease inhibitors, or serpins, are paradigms for this delicate balance between function and misfolding. Serpins exist in a metastable state that undergoes a major conformational change in order to inhibit proteases. However, conformational labiality of the native serpin fold renders them susceptible to misfolding, which underlies misfolding diseases such as alpha1-antitrypsin deficiency. To investigate how serpins balance function and folding, we used consensus design to create conserpin, a synthetic serpin that folds reversibly, is functional, thermostable, and polymerization resistant. Characterization of its structure, folding and dynamics suggest that consensus design has remodeled the folding landscape to reconcile competing requirements for stability and function. This approach may offer general benefits for engineering functional proteins that have risky folding landscapes, including the removal of aggregation-prone intermediates, and modifying scaffolds for use as protein therapeutics.},
articlenumber = {33958},
doi = {10.1038/srep33958},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
url = {http://dx.doi.org/10.1038/srep33958},
}
ABSTRACT The rugged folding landscapes of functional proteins puts them at risk of misfolding and aggregation. Serine protease inhibitors, or serpins, are paradigms for this delicate balance between function and misfolding. Serpins exist in a metastable state that undergoes a major conformational change in order to inhibit proteases. However, conformational labiality of the native serpin fold renders them susceptible to misfolding, which underlies misfolding diseases such as alpha1-antitrypsin deficiency. To investigate how serpins balance function and folding, we used consensus design to create conserpin, a synthetic serpin that folds reversibly, is functional, thermostable, and polymerization resistant. Characterization of its structure, folding and dynamics suggest that consensus design has remodeled the folding landscape to reconcile competing requirements for stability and function. This approach may offer general benefits for engineering functional proteins that have risky folding landscapes, including the removal of aggregation-prone intermediates, and modifying scaffolds for use as protein therapeutics.

Zaidi, N. A., Petitjean, F., & Webb, G. I.
Proceedings of the 20th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining, PAKDD 2016, pp. 341-353, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ZaidiEtAl16,
Title = {Preconditioning an Artificial Neural Network Using Naive {Bayes}},
Author = {Zaidi, Nayyar A. and Petitjean, Francois and Webb, Geoffrey I.},
Booktitle = {Proceedings of the 20th {Pacific-Asia} Conference on Advances in Knowledge Discovery and Data Mining, {PAKDD} 2016},
Year = {2016},
Editor = {Bailey, James and Khan, Latifur and Washio, Takashi and Dobbie, Gill and Huang, Zhexue Joshua and Wang, Ruili},
Pages = {341--353},
Publisher = {Springer International Publishing},
Abstract = {Logistic Regression (LR) is a workhorse of the statistics community and a state-of-the-art machine learning classifier. It learns a linear model from inputs to outputs trained by optimizing the Conditional Log-Likelihood (CLL) of the data. Recently, it has been shown that preconditioning LR using a Naive Bayes (NB) model speeds up LR learning many-fold. One can, however, train a linear model by optimizing the mean-square-error (MSE) instead of CLL. This leads to an Artificial Neural Network (ANN) with no hidden layer. In this work, we study the effect of NB preconditioning on such an ANN classifier. Optimizing MSE instead of CLL may lead to a lower bias classifier and hence result in better performance on big datasets. We show that this NB preconditioning can speed-up convergence significantly. We also show that optimizing a linear model with MSE leads to a lower bias classifier than optimizing with CLL. We also compare the performance to state-of-the-art classifier Random Forest.},
Doi = {10.1007/978-3-319-31753-3_28},
ISBN = {978-3-319-31753-3},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning},
Url = {http://dx.doi.org/10.1007/978-3-319-31753-3_28}
}
ABSTRACT Logistic Regression (LR) is a workhorse of the statistics community and a state-of-the-art machine learning classifier. It learns a linear model from inputs to outputs trained by optimizing the Conditional Log-Likelihood (CLL) of the data. Recently, it has been shown that preconditioning LR using a Naive Bayes (NB) model speeds up LR learning many-fold. One can, however, train a linear model by optimizing the mean-square-error (MSE) instead of CLL. This leads to an Artificial Neural Network (ANN) with no hidden layer. In this work, we study the effect of NB preconditioning on such an ANN classifier. Optimizing MSE instead of CLL may lead to a lower bias classifier and hence result in better performance on big datasets. We show that this NB preconditioning can speed-up convergence significantly. We also show that optimizing a linear model with MSE leads to a lower bias classifier than optimizing with CLL. We also compare the performance to state-of-the-art classifier Random Forest.

Zaidi, N. A., Webb, G. I., Carman, M. J., Petitjean, F., & Cerquides, J.
Machine Learning, 104(2), 151-194, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{ZaidiEtAl16b,
author = {Zaidi, Nayyar A. and Webb, Geoffrey I. and Carman, Mark J. and Petitjean, Francois and Cerquides, Jesus},
journal = {Machine Learning},
title = {{ALRn}: Accelerated higher-order logistic regression},
year = {2016},
issn = {1573-0565},
number = {2},
pages = {151--194},
volume = {104},
abstract = {This paper introduces Accelerated Logistic Regression: a hybrid generative-discriminative approach to training Logistic Regression with high-order features. We present two main results: (1) that our combined generative-discriminative approach significantly improves the efficiency of Logistic Regression and (2) that incorporating higher order features (i.e. features that are the Cartesian products of the original features) reduces the bias of Logistic Regression, which in turn significantly reduces its error on large datasets. We assess the efficacy of Accelerated Logistic Regression by conducting an extensive set of experiments on 75 standard datasets. We demonstrate its competitiveness, particularly on large datasets, by comparing against state-of-the-art classifiers including Random Forest and Averaged n-Dependence Estimators.},
doi = {10.1007/s10994-016-5574-8},
keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
related = {combining-generative-and-discriminative-learning},
url = {http://rdcu.be/unVb},
}
ABSTRACT This paper introduces Accelerated Logistic Regression: a hybrid generative-discriminative approach to training Logistic Regression with high-order features. We present two main results: (1) that our combined generative-discriminative approach significantly improves the efficiency of Logistic Regression and (2) that incorporating higher order features (i.e. features that are the Cartesian products of the original features) reduces the bias of Logistic Regression, which in turn significantly reduces its error on large datasets. We assess the efficacy of Accelerated Logistic Regression by conducting an extensive set of experiments on 75 standard datasets. We demonstrate its competitiveness, particularly on large datasets, by comparing against state-of-the-art classifiers including Random Forest and Averaged n-Dependence Estimators.

Zhang, A., Shi, W., & Webb, G. I.
Data Mining and Knowledge Discovery, 30(4), 928-963, 2016.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{ZhangEtAl16,
Title = {Mining significant association rules from uncertain data},
Author = {Zhang, Anshu and Shi, Wenzhong and Webb, Geoffrey I.},
Journal = {Data Mining and Knowledge Discovery},
Year = {2016},
Number = {4},
Pages = {928--963},
Volume = {30},
Abstract = {In association rule mining, the trade-off between avoiding harmful spurious rules and preserving authentic ones is an ever critical barrier to obtaining reliable and useful results. The statistically sound technique for evaluating statistical significance of association rules is superior in preventing spurious rules, yet can also cause severe loss of true rules in presence of data error. This study presents a new and improved method for statistical test on association rules with uncertain erroneous data. An original mathematical model was established to describe data error propagation through computational procedures of the statistical test. Based on the error model, a scheme combining analytic and simulative processes was designed to correct the statistical test for distortions caused by data error. Experiments on both synthetic and real-world data show that the method significantly recovers the loss in true rules (reduces type-2 error) due to data error occurring in original statistically sound method. Meanwhile, the new method maintains effective control over the familywise error rate, which is the distinctive advantage of the original statistically sound technique. Furthermore, the method is robust against inaccurate data error probability information and situations not fulfilling the commonly accepted assumption on independent error probabilities of different data items. The method is particularly effective for rules which were most practically meaningful yet sensitive to data error. The method proves promising in enhancing values of association rule mining results and helping users make correct decisions.},
Doi = {10.1007/s10618-015-0446-6},
Keywords = {Association Rule Discovery and statistically sound discovery},
Publisher = {Springer},
Related = {statistically-sound-association-discovery}
}
ABSTRACT In association rule mining, the trade-off between avoiding harmful spurious rules and preserving authentic ones is an ever critical barrier to obtaining reliable and useful results. The statistically sound technique for evaluating statistical significance of association rules is superior in preventing spurious rules, yet can also cause severe loss of true rules in presence of data error. This study presents a new and improved method for statistical test on association rules with uncertain erroneous data. An original mathematical model was established to describe data error propagation through computational procedures of the statistical test. Based on the error model, a scheme combining analytic and simulative processes was designed to correct the statistical test for distortions caused by data error. Experiments on both synthetic and real-world data show that the method significantly recovers the loss in true rules (reduces type-2 error) due to data error occurring in original statistically sound method. Meanwhile, the new method maintains effective control over the familywise error rate, which is the distinctive advantage of the original statistically sound technique. Furthermore, the method is robust against inaccurate data error probability information and situations not fulfilling the commonly accepted assumption on independent error probabilities of different data items. The method is particularly effective for rules which were most practically meaningful yet sensitive to data error. The method proves promising in enhancing values of association rule mining results and helping users make correct decisions.

Wang, H., Feng, L., Zhang, Z., Webb, G. I., Lin, D., & Song, J.
Scientific Reports, 6, Art. no. 21383, 2016.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WangEtAl16,
author = {Wang, H. and Feng, L. and Zhang, Z. and Webb, G. I. and Lin, D. and Song, J.},
journal = {Scientific Reports},
title = {Crysalis: an integrated server for computational analysis and design of protein crystallization},
year = {2016},
volume = {6},
abstract = {The failure of multi-step experimental procedures to yield diffraction-quality crystals is a major bottleneck in protein structure determination. Accordingly, several bioinformatics methods have been successfully developed and employed to select crystallizable proteins. Unfortunately, the majority of existing in silico methods only allow the prediction of crystallization propensity, seldom enabling computational design of protein mutants that can be targeted for enhancing protein crystallizability. Here, we present Crysalis, an integrated crystallization analysis tool that builds on support-vector regression (SVR) models to facilitate computational protein crystallization prediction, analysis, and design. More specifically, the functionality of this new tool includes: (1) rapid selection of target crystallizable proteins at the proteome level, (2) identification of site non-optimality for protein crystallization and systematic analysis of all potential single-point mutations that might enhance protein crystallization propensity, and (3) annotation of target protein based on predicted structural properties. We applied the design mode of Crysalis to identify site non-optimality for protein crystallization on a proteome-scale, focusing on proteins currently classified as non-crystallizable. Our results revealed that site non-optimality is based on biases related to residues, predicted structures, physicochemical properties, and sequence loci, which provides in-depth understanding of the features influencing protein crystallization. Crysalis is freely available at http://nmrcen.xmu.edu.cn/crysalis/.},
articlenumber = {21383},
doi = {10.1038/srep21383},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT The failure of multi-step experimental procedures to yield diffraction-quality crystals is a major bottleneck in protein structure determination. Accordingly, several bioinformatics methods have been successfully developed and employed to select crystallizable proteins. Unfortunately, the majority of existing in silico methods only allow the prediction of crystallization propensity, seldom enabling computational design of protein mutants that can be targeted for enhancing protein crystallizability. Here, we present Crysalis, an integrated crystallization analysis tool that builds on support-vector regression (SVR) models to facilitate computational protein crystallization prediction, analysis, and design. More specifically, the functionality of this new tool includes: (1) rapid selection of target crystallizable proteins at the proteome level, (2) identification of site non-optimality for protein crystallization and systematic analysis of all potential single-point mutations that might enhance protein crystallization propensity, and (3) annotation of target protein based on predicted structural properties. We applied the design mode of Crysalis to identify site non-optimality for protein crystallization on a proteome-scale, focusing on proteins currently classified as non-crystallizable. Our results revealed that site non-optimality is based on biases related to residues, predicted structures, physicochemical properties, and sequence loci, which provides in-depth understanding of the features influencing protein crystallization. Crysalis is freely available at http://nmrcen.xmu.edu.cn/crysalis/.

Petitjean, F., & Webb, G. I.
Proceedings of the 2015 SIAM International Conference on Data Mining, pp. 469-477, 2015.
Best Research Paper Honorable Mention Award
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{PetitjeanWebb15,
author = {Petitjean, F. and Webb, G. I.},
booktitle = {Proceedings of the 2015 {SIAM} International Conference on Data Mining},
title = {Scaling log-linear analysis to datasets with thousands of variables},
year = {2015},
pages = {469--477},
abstract = {Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.},
comment = {Best Research Paper Honorable Mention Award},
doi = {10.1137/1.9781611974010.53},
keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
related = {scalable-graphical-modeling},
url = {http://epubs.siam.org/doi/pdf/10.1137/1.9781611974010.53},
}
ABSTRACT Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.

Porebski, B. T., Nickson, A. A., Hoke, D. E., Hunter, M. R., Zhu, L., McGowan, S., Webb, G. I., & Buckle, A. M.
Protein Engineering, Design and Selection, 28(3), 67-78, 2015.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{PorebskiEtAl15,
Title = {Structural and dynamic properties that govern the stability of an engineered fibronectin type {III} domain},
Author = {Porebski, B. T. and Nickson, A. A. and Hoke, D. E. and Hunter, M. R. and Zhu, L. and McGowan, S. and Webb, G. I. and Buckle, A. M.},
Journal = {Protein Engineering, Design and Selection},
Year = {2015},
Number = {3},
Pages = {67--78},
Volume = {28},
Abstract = {Consensus protein design is a rapid and reliable technique for the improvement of protein stability, which relies on the use of homologous protein sequences. To enhance the stability of a fibronectin type III (FN3) domain, consensus design was employed using an alignment of 2123 sequences. The resulting FN3 domain, FN3con, has unprecedented stability, with a melting temperature >100{\textdegree}C, a $\Delta G_{D-N}$ of 15.5 kcal mol$^{-1}$ and a greatly reduced unfolding rate compared with wild-type. To determine the underlying molecular basis for stability, an X-ray crystal structure of FN3con was determined to 2.0 {\AA} and compared with other FN3 domains of varying stabilities. The structure of FN3con reveals significantly increased salt bridge interactions that are cooperatively networked, and a highly optimized hydrophobic core. Molecular dynamics simulations of FN3con and comparison structures show the cooperative power of electrostatic and hydrophobic networks in improving FN3con stability. Taken together, our data reveal that FN3con stability does not result from a single mechanism, but rather the combination of several features and the removal of non-conserved, unfavorable interactions. The large number of sequences employed in this study has most likely enhanced the robustness of the consensus design, which is now possible due to the increased sequence availability in the post-genomic era. These studies increase our knowledge of the molecular mechanisms that govern stability and demonstrate the rising potential for enhancing stability via the consensus method.},
Doi = {10.1093/protein/gzv002},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://peds.oxfordjournals.org/content/28/3/67.full.pdf+html}
}
ABSTRACT Consensus protein design is a rapid and reliable technique for the improvement of protein stability, which relies on the use of homologous protein sequences. To enhance the stability of a fibronectin type III (FN3) domain, consensus design was employed using an alignment of 2123 sequences. The resulting FN3 domain, FN3con, has unprecedented stability, with a melting temperature >100°C, a $\Delta G_{D-N}$ of 15.5 kcal mol$^{-1}$ and a greatly reduced unfolding rate compared with wild-type. To determine the underlying molecular basis for stability, an X-ray crystal structure of FN3con was determined to 2.0 Å and compared with other FN3 domains of varying stabilities. The structure of FN3con reveals significantly increased salt bridge interactions that are cooperatively networked, and a highly optimized hydrophobic core. Molecular dynamics simulations of FN3con and comparison structures show the cooperative power of electrostatic and hydrophobic networks in improving FN3con stability. Taken together, our data reveal that FN3con stability does not result from a single mechanism, but rather the combination of several features and the removal of non-conserved, unfavorable interactions. The large number of sequences employed in this study has most likely enhanced the robustness of the consensus design, which is now possible due to the increased sequence availability in the post-genomic era. These studies increase our knowledge of the molecular mechanisms that govern stability and demonstrate the rising potential for enhancing stability via the consensus method.

Proceedings of the 21st ACM SIGKDD International Conference on Knowledge Discovery and Data Mining
Cao, L., Zhang, C., Joachims, T., Webb, G. I., Margineantu, D. D., & Williams, G. (Eds.).
ACM, 2015.
[Bibtex]  → Access on publisher site

@Proceedings{WebbKDD2015,
title = {Proceedings of the 21st {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
year = {2015},
editor = {Cao, L. and Zhang, C. and Joachims, T. and Webb, G. I. and Margineantu, D. D. and Williams, G.},
publisher = {ACM},
url = {http://dl.acm.org/citation.cfm?id=2783258},
}
ABSTRACT 

Li, F., Li, C., Wang, M., Webb, G. I., Zhang, Y., Whisstock, J. C., & Song, J.
Bioinformatics, 31(9), 1411-1419, 2015.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{LiEtAl15,
Title = {{GlycoMine}: a machine learning-based approach for predicting {N}-, {C}- and {O}-linked glycosylation in the human proteome},
Author = {Li, F. and Li, C. and Wang, M. and Webb, G. I. and Zhang, Y. and Whisstock, J. C. and Song, J.},
Journal = {Bioinformatics},
Year = {2015},
Number = {9},
Pages = {1411--1419},
Volume = {31},
Abstract = {Motivation: Glycosylation is a ubiquitous type of protein post-translational modification (PTM) in eukaryotic cells, which plays vital roles in various biological processes (BPs) such as cellular communication, ligand recognition and subcellular recognition. It is estimated that >50\% of the entire human proteome is glycosylated. However, it is still a significant challenge to identify glycosylation sites, which requires expensive/laborious experimental research. Thus, bioinformatics approaches that can predict the glycan occupancy at specific sequons in protein sequences would be useful for understanding and utilizing this important PTM.
Results: In this study, we present a novel bioinformatics tool called GlycoMine, which is a comprehensive tool for the systematic in silico identification of C-linked, N-linked, and O-linked glycosylation sites in the human proteome. GlycoMine was developed using the random forest algorithm and evaluated based on a well-prepared up-to-date benchmark dataset that encompasses all three types of glycosylation sites, which was curated from multiple public resources. Heterogeneous sequences and functional features were derived from various sources, and subjected to further two-step feature selection to characterize a condensed subset of optimal features that contributed most to the type-specific prediction of glycosylation sites. Five-fold cross-validation and independent tests show that this approach significantly improved the prediction performance compared with four existing prediction tools: NetNGlyc, NetOGlyc, EnsembleGly and GPP. We demonstrated that this tool could identify candidate glycosylation sites in case study proteins and applied it to identify many high-confidence glycosylation target proteins by screening the entire human proteome.},
Doi = {10.1093/bioinformatics/btu852},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT Motivation: Glycosylation is a ubiquitous type of protein post-translational modification (PTM) in eukaryotic cells, which plays vital roles in various biological processes (BPs) such as cellular communication, ligand recognition and subcellular recognition. It is estimated that >50% of the entire human proteome is glycosylated. However, it is still a significant challenge to identify glycosylation sites, which requires expensive/laborious experimental research. Thus, bioinformatics approaches that can predict the glycan occupancy at specific sequons in protein sequences would be useful for understanding and utilizing this important PTM. Results: In this study, we present a novel bioinformatics tool called GlycoMine, which is a comprehensive tool for the systematic in silico identification of C-linked, N-linked, and O-linked glycosylation sites in the human proteome. GlycoMine was developed using the random forest algorithm and evaluated based on a well-prepared up-to-date benchmark dataset that encompasses all three types of glycosylation sites, which was curated from multiple public resources. Heterogeneous sequences and functional features were derived from various sources, and subjected to further two-step feature selection to characterize a condensed subset of optimal features that contributed most to the type-specific prediction of glycosylation sites. Five-fold cross-validation and independent tests show that this approach significantly improved the prediction performance compared with four existing prediction tools: NetNGlyc, NetOGlyc, EnsembleGly and GPP. We demonstrated that this tool could identify candidate glycosylation sites in case study proteins and applied it to identify many high-confidence glycosylation target proteins by screening the entire human proteome.

Zaidi, N., Carman, M., Cerquides, J., & Webb, G. I.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 1097-1102, 2014.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ZaidiEtAl14,
author = {Zaidi, N. and Carman, M. and Cerquides, J. and Webb, G. I.},
booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
title = {Naive-{Bayes} Inspired Effective Pre-Conditioner for Speeding-up Logistic Regression},
year = {2014},
pages = {1097--1102},
abstract = {We propose an alternative parameterization of
Logistic Regression (LR) for the categorical data, multi-class
setting. LR optimizes the conditional log-likelihood over the
training data and is based on an iterative optimization procedure
to tune this objective function. The optimization procedure
employed may be sensitive to scale and hence an effective
pre-conditioning method is recommended. Many problems in
machine learning involve arbitrary scales or categorical data
(where simple standardization of features is not applicable).
The problem can be alleviated by using optimization routines
that are invariant to scale such as (second-order) Newton
methods. However, computing and inverting the Hessian is a
costly procedure and not feasible for big data. Thus one must
often rely on first-order methods such as gradient descent (GD),
stochastic gradient descent (SGD) or approximate second-order
such as quasi-Newton (QN) routines, which are not
invariant to scale. This paper proposes a simple yet effective
pre-conditioner for speeding-up LR based on naive Bayes
conditional probability estimates. The idea is to scale each
attribute by the log of the conditional probability of that
attribute given the class. This formulation substantially speeds up
LR's convergence. It also provides a weighted naive Bayes
formulation which yields an effective framework for hybrid
generative-discriminative classification.},
doi = {10.1109/ICDM.2014.53},
keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
related = {combining-generative-and-discriminative-learning},
}
ABSTRACT We propose an alternative parameterization of Logistic Regression (LR) for the categorical data, multi-class setting. LR optimizes the conditional log-likelihood over the training data and is based on an iterative optimization procedure to tune this objective function. The optimization procedure employed may be sensitive to scale and hence an effective pre-conditioning method is recommended. Many problems in machine learning involve arbitrary scales or categorical data (where simple standardization of features is not applicable). The problem can be alleviated by using optimization routines that are invariant to scale such as (second-order) Newton methods. However, computing and inverting the Hessian is a costly procedure and not feasible for big data. Thus one must often rely on first-order methods such as gradient descent (GD), stochastic gradient descent (SGD) or approximate second-order such as quasi-Newton (QN) routines, which are not invariant to scale. This paper proposes a simple yet effective pre-conditioner for speeding-up LR based on naive Bayes conditional probability estimates. The idea is to scale each attribute by the log of the conditional probability of that attribute given the class. This formulation substantially speeds up LR's convergence. It also provides a weighted naive Bayes formulation which yields an effective framework for hybrid generative-discriminative classification.

Webb, G. I.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 1031-1036, 2014.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{Webb14,
author = {Webb, G. I.},
booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
title = {Contrary to Popular Belief Incremental Discretization can be Sound, Computationally Efficient and Extremely Useful for Streaming Data},
year = {2014},
pages = {1031--1036},
abstract = {Discretization of streaming data has received surprisingly
little attention. This might be because streaming data
require incremental discretization with cutpoints that may vary
over time and this is perceived as undesirable. We argue, to
the contrary, that it can be desirable for a discretization to
evolve in synchronization with an evolving data stream, even
when the learner assumes that attribute values' meanings remain
invariant over time. We examine the issues associated with
discretization in the context of distribution drift and develop
computationally efficient incremental discretization algorithms.
We show that discretization can reduce the error of a classical
incremental learner and that allowing a discretization to drift in
synchronization with distribution drift can further reduce error.},
doi = {10.1109/ICDM.2014.123},
keywords = {Concept Drift and Discretization and Incremental Learning and Stream mining},
related = {learning-from-non-stationary-distributions},
}
ABSTRACT Discretization of streaming data has received surprisingly little attention. This might be because streaming data require incremental discretization with cutpoints that may vary over time and this is perceived as undesirable. We argue, to the contrary, that it can be desirable for a discretization to evolve in synchronization with an evolving data stream, even when the learner assumes that attribute values' meanings remain invariant over time. We examine the issues associated with discretization in the context of distribution drift and develop computationally efficient incremental discretization algorithms. We show that discretization can reduce the error of a classical incremental learner and that allowing a discretization to drift in synchronization with distribution drift can further reduce error.

Webb, G. I., & Vreeken, J.
ACM Transactions on Knowledge Discovery from Data, 8(3), Art. no. 15, 2014.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WebbVreeken13,
author = {Webb, G. I. and Vreeken, J.},
journal = {{ACM} Transactions on Knowledge Discovery from Data},
title = {Efficient Discovery of the Most Interesting Associations},
year = {2014},
number = {3},
volume = {8},
abstract = {Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations
in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient
requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as
consideration of all supersets. This paper presents the first published algorithm for efficiently discovering
self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms
based on upper-bounds on itemset value and statistical significance level. It demonstrates that finding top-k
productive and non-redundant itemsets, with post processing to identify those that are not independently
productive, can efficiently identify small sets of key associations. We present extensive evaluation of the
strengths and limitations of the technique including comparisons with alternative approaches to finding the
most interesting associations.},
articlenumber = {15},
doi = {10.1145/2601433},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
publisher = {ACM},
related = {filtered-top-k-association-discovery},
url = {http://dl.acm.org/authorize?N80829},
}
ABSTRACT Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as consideration of all supersets. This paper presents the first published algorithm for efficiently discovering self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms based on upper-bounds on itemset value and statistical significance level. It demonstrates that finding top-k productive and non-redundant itemsets, with post processing to identify those that are not independently productive, can efficiently identify small sets of key associations. We present extensive evaluation of the strengths and limitations of the technique including comparisons with alternative approaches to finding the most interesting associations.

Petitjean, F., Allison, L., & Webb, G. I.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 480-489, 2014.
One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{PetitjeanEtAl14a,
author = {Petitjean, F. and Allison, L. and Webb, G. I.},
booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
title = {A Statistically Efficient and Scalable Method for Log-Linear Analysis of High-Dimensional Data},
year = {2014},
pages = {480--489},
abstract = {Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of $\chi^2$ tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches -- statistical efficiency -- 2) controls for the risk of false discoveries as well as statistical approaches -- high precision -- and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer -- computational efficiency.},
comment = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
doi = {10.1109/ICDM.2014.23},
keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and DP140100087},
related = {scalable-graphical-modeling},
}
ABSTRACT Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of χ² tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches – statistical efficiency – 2) controls for the risk of false discoveries as well as statistical approaches – high precision – and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer – computational efficiency.

Chen, S., Martinez, A., & Webb, G. I.
Proceedings of the 18th Pacific-Asia Conference on Knowledge Discovery and Data Mining, pp. 86-97, 2014.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ChenEtAl14,
author = {Chen, S. and Martinez, A. and Webb, G. I.},
booktitle = {Proceedings of the 18th {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining},
title = {Highly Scalable Attribute Selection for {AODE}},
year = {2014},
pages = {86--97},
abstract = {Averaged One-Dependence Estimators (AODE) is a popular
and effective approach to Bayesian learning. In this paper, a new
attribute selection approach is proposed for AODE. It can search in a
large model space, while it requires only a single extra pass through the
training data, resulting in a computationally efficient two-pass learning
algorithm. The experimental results indicate that the new technique significantly
reduces AODE's bias at the cost of a modest increase in training
time. Its low bias and computational efficiency make it an attractive
algorithm for learning from big data.},
doi = {10.1007/978-3-319-06605-9_8},
keywords = {Conditional Probability Estimation and AODE and DP140100087},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Averaged One-Dependence Estimators (AODE) is a popular and effective approach to Bayesian learning. In this paper, a new attribute selection approach is proposed for AODE. It can search in a large model space, while it requires only a single extra pass through the training data, resulting in a computationally efficient two-pass learning algorithm. The experimental results indicate that the new technique significantly reduces AODE's bias at the cost of a modest increase in training time. Its low bias and computational efficiency make it an attractive algorithm for learning from big data.

Petitjean, F., Forestier, G., Webb, G. I., Nicholson, A., Chen, Y., & Keogh, E.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 470-479, 2014.
One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{PetitjeanEtAl14b,
author = {Petitjean, F. and Forestier, G. and Webb, G. I. and Nicholson, A. and Chen, Y. and Keogh, E.},
booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
title = {Dynamic Time Warping Averaging of Time Series Allows Faster and More Accurate Classification},
year = {2014},
pages = {470--479},
abstract = {Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' times series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.},
comment = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
doi = {10.1109/ICDM.2014.27},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' times series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.

Li, Y., Wang, M., Wang, H., Tan, H., Zhang, Z., Webb, G. I., & Song, J.
Scientific Reports, 4, Art. no. 5765, 2014.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@article{LiEtAl2014,
  author        = {Li, Y. and Wang, M. and Wang, H. and Tan, H. and Zhang, Z. and Webb, G. I. and Song, J.},
  title         = {Accurate in Silico Identification of Species-Specific Acetylation Sites by Integrating Protein Sequence-Derived and Functional Features},
  journal       = {Scientific Reports},
  volume        = {4},
  year          = {2014},
  articlenumber = {5765},
  doi           = {10.1038/srep05765},
  keywords      = {Bioinformatics and DP140100087},
  related       = {computational-biology},
  abstract      = {Lysine acetylation is a reversible post-translational modification, playing an important role in cytokine signaling, transcriptional regulation, and apoptosis. To fully understand acetylation mechanisms, identification of substrates and specific acetylation sites is crucial. Experimental identification is often time-consuming and expensive. Alternative bioinformatics methods are cost-effective and can be used in a high-throughput manner to generate relatively precise predictions. Here we develop a method termed as SSPKA for species-specific lysine acetylation prediction, using random forest classifiers that combine sequence-derived and functional features with two-step feature selection. Feature importance analysis indicates functional features, applied for lysine acetylation site prediction for the first time, significantly improve the predictive performance. We apply the SSPKA model to screen the entire human proteome and identify many high-confidence putative substrates that are not previously identified. The results along with the implemented Java tool, serve as useful resources to elucidate the mechanism of lysine acetylation and facilitate hypothesis-driven experimental design and validation.},
}
ABSTRACT Lysine acetylation is a reversible post-translational modification, playing an important role in cytokine signaling, transcriptional regulation, and apoptosis. To fully understand acetylation mechanisms, identification of substrates and specific acetylation sites is crucial. Experimental identification is often time-consuming and expensive. Alternative bioinformatics methods are cost-effective and can be used in a high-throughput manner to generate relatively precise predictions. Here we develop a method termed as SSPKA for species-specific lysine acetylation prediction, using random forest classifiers that combine sequence-derived and functional features with two-step feature selection. Feature importance analysis indicates functional features, applied for lysine acetylation site prediction for the first time, significantly improve the predictive performance. We apply the SSPKA model to screen the entire human proteome and identify many high-confidence putative substrates that are not previously identified. The results along with the implemented Java tool, serve as useful resources to elucidate the mechanism of lysine acetylation and facilitate hypothesis-driven experimental design and validation.

Provost, F., Webb, G. I., Bekkerman, R., Etzioni, O., Fayyad, U., & Perlich, C.
Big Data, 2(3), 117-128, 2014.
[Bibtex] [Abstract]  → Access on publisher site

@Article{ProvostEtAl14,
Title = {A Data Scientist's Guide to Start-Ups},
Author = {Provost, F. and Webb, G. I. and Bekkerman, R. and Etzioni, O. and Fayyad, U. and Perlich, C.},
Journal = {Big Data},
Year = {2014},
Number = {3},
Pages = {117--128},
Volume = {2},
Abstract = {In August 2013, we held a panel discussion at the KDD 2013 conference in Chicago on the subject of data science, data scientists, and start-ups. KDD is the premier conference on data science research and practice. The panel discussed the pros and cons for top-notch data scientists of the hot data science start-up scene. In this article, we first present background on our panelists. Our four panelists have unquestionable pedigrees in data science and substantial experience with start-ups from multiple perspectives (founders, employees, chief scientists, venture capitalists). For the casual reader, we next present a brief summary of the experts' opinions on eight of the issues the panel discussed. The rest of the article presents a lightly edited transcription of the entire panel discussion.},
Doi = {10.1089/big.2014.0031},
Keywords = {Big Data},
Url = {http://dx.doi.org/10.1089/big.2014.0031}
}
ABSTRACT In August 2013, we held a panel discussion at the KDD 2013 conference in Chicago on the subject of data science, data scientists, and start-ups. KDD is the premier conference on data science research and practice. The panel discussed the pros and cons for top-notch data scientists of the hot data science start-up scene. In this article, we first present background on our panelists. Our four panelists have unquestionable pedigrees in data science and substantial experience with start-ups from multiple perspectives (founders, employees, chief scientists, venture capitalists). For the casual reader, we next present a brief summary of the experts' opinions on eight of the issues the panel discussed. The rest of the article presents a lightly edited transcription of the entire panel discussion.

Provost, F., & Webb, G. I.
Proceedings of the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, p. 1445, 2013.
[Bibtex]  → Access on publisher site

@InProceedings{ProvostWebb13,
Title = {Panel: a data scientist's guide to making money from start-ups},
Author = {Provost, F. and Webb, G. I.},
Booktitle = {Proceedings of the 19th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
Year = {2013},
Pages = {1445},
Doi = {10.1145/2487575.2494523},
Url = {http://dl.acm.org/citation.cfm?doid=2487575.2494523}
}
ABSTRACT 

Zaidi, N., & Webb, G. I.
Proceedings of the 17th Pacific-Asia Conference on Knowledge Discovery and Data Mining, pp. 149-160, 2013.
[Bibtex]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ZaidiWebb13,
Title = {Fast and Effective Single Pass {Bayesian} Learning},
Author = {Zaidi, N. and Webb, G. I.},
Booktitle = {Proceedings of the 17th {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining},
Year = {2013},
Pages = {149--160},
Doi = {10.1007/978-3-642-37453-1_13},
Keywords = {Conditional Probability Estimation and AODE},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT 

Zaidi, N. A., Cerquides, J., Carman, M. J., & Webb, G. I.
Journal of Machine Learning Research, 14, 1947-1988, 2013.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Zaidi2013,
Title = {Alleviating Naive {Bayes} Attribute Independence Assumption by Attribute Weighting},
Author = {Zaidi, Nayyar A. and Cerquides, Jesus and Carman, Mark J. and Webb, Geoffrey I.},
Journal = {Journal of Machine Learning Research},
Year = {2013},
Pages = {1947--1988},
Volume = {14},
Abstract = {Despite the simplicity of the Naive Bayes classifier, it has continued to perform well against more sophisticated newcomers and has remained, therefore, of great interest to the machine learning community. Of numerous approaches to refining the naive Bayes classifier, attribute weighting has received less attention than it warrants. Most approaches, perhaps influenced by attribute weighting in other machine learning algorithms, use weighting to place more emphasis on highly predictive attributes than those that are less predictive. In this paper, we argue that for naive Bayes attribute weighting should instead be used to alleviate the conditional independence assumption. Based on this premise, we propose a weighted naive Bayes algorithm, called WANBIA, that selects weights to minimize either the negative conditional log likelihood or the mean squared error objective functions. We perform extensive evaluations and find that WANBIA is a competitive alternative to state of the art classifiers like Random Forest, Logistic Regression and A1DE.},
Keywords = {Conditional Probability Estimation and WANBIA},
Related = {combining-generative-and-discriminative-learning},
Url = {http://jmlr.org/papers/volume14/zaidi13a/zaidi13a.pdf},
Urltext = {Link to paper on JMLR site}
}
ABSTRACT Despite the simplicity of the Naive Bayes classifier, it has continued to perform well against more sophisticated newcomers and has remained, therefore, of great interest to the machine learning community. Of numerous approaches to refining the naive Bayes classifier, attribute weighting has received less attention than it warrants. Most approaches, perhaps influenced by attribute weighting in other machine learning algorithms, use weighting to place more emphasis on highly predictive attributes than those that are less predictive. In this paper, we argue that for naive Bayes attribute weighting should instead be used to alleviate the conditional independence assumption. Based on this premise, we propose a weighted naive Bayes algorithm, called WANBIA, that selects weights to minimize either the negative conditional log likelihood or the mean squared error objective functions. We perform extensive evaluations and find that WANBIA is a competitive alternative to state of the art classifiers like Random Forest, Logistic Regression and A1DE.

Suraweera, P., Webb, G. I., Evans, I., & Wallace, M.
Transportation Research Part C: Emerging Technologies, 26, 214-232, 2013.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Suraweera2013,
author = {Suraweera, P. and Webb, G. I. and Evans, I. and Wallace, M.},
journal = {Transportation Research Part C: Emerging Technologies},
title = {Learning crew scheduling constraints from historical schedules},
year = {2013},
pages = {214--232},
volume = {26},
abstract = {For most airlines, there are numerous policies, agreements and regulations that govern the workload of airline crew. Although some constraints are formally documented, there are many others based on established practice and tacit understanding. Consequently, the task of developing a formal representation of the constraints that govern the working conditions of an airline's crew requires extensive time and effort involving interviews with the airline's crew schedulers and detailed analysis of historical schedules. We have developed a system that infers crew scheduling constraints from historical crew schedules with the assistance of a domain expert. This system implements the ComCon algorithm developed to learn constraints that prescribe the limits of certain aspects of crew schedules. The algorithm induces complex multivariate constraints based on a set of user provided templates that outline the general structure of important constraints. The results of an evaluation conducted with crew schedules from two commercial airlines show that the system is capable of learning the majority of the minimum rest constraints.},
doi = {10.1016/j.trc.2012.08.002},
keywords = {Engineering Applications},
related = {engineering-applications},
}
ABSTRACT For most airlines, there are numerous policies, agreements and regulations that govern the workload of airline crew. Although some constraints are formally documented, there are many others based on established practice and tacit understanding. Consequently, the task of developing a formal representation of the constraints that govern the working conditions of an airline's crew requires extensive time and effort involving interviews with the airline's crew schedulers and detailed analysis of historical schedules. We have developed a system that infers crew scheduling constraints from historical crew schedules with the assistance of a domain expert. This system implements the ComCon algorithm developed to learn constraints that prescribe the limits of certain aspects of crew schedules. The algorithm induces complex multivariate constraints based on a set of user provided templates that outline the general structure of important constraints. The results of an evaluation conducted with crew schedules from two commercial airlines show that the system is capable of learning the majority of the minimum rest constraints.

Petitjean, F., Webb, G. I., & Nicholson, A. E.
Proceedings of the 13th IEEE International Conference on Data Mining, pp. 597-606, 2013.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{PetitjeanEtAl13,
Title = {Scaling log-linear analysis to high-dimensional data},
Author = {Petitjean, F. and Webb, G. I. and Nicholson, A. E.},
Booktitle = {Proceedings of the 13th {IEEE} International Conference on Data Mining},
Year = {2013},
Pages = {597--606},
Abstract = {Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We develop an efficient approach to log-linear analysis that scales to hundreds of variables by melding the classical statistical machinery of log-linear analysis with advanced data mining techniques from association discovery and graphical modeling.},
Doi = {10.1109/ICDM.2013.17},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
Related = {scalable-graphical-modeling}
}
ABSTRACT Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We develop an efficient approach to log-linear analysis that scales to hundreds of variables by melding the classical statistical machinery of log-linear analysis with advanced data mining techniques from association discovery and graphical modeling.

Song, J., Tan, H., Wang, M., Webb, G. I., & Akutsu, T.
PLoS ONE, 7(2), Art. no. e30361, 2012.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl12,
author = {Song, Jiangning and Tan, Hao and Wang, Mingjun and Webb, Geoffrey I. and Akutsu, Tatsuya},
journal = {PLoS ONE},
title = {{TANGLE}: Two-Level Support Vector Regression Approach for Protein Backbone Torsion Angle Prediction from Primary Sequences},
year = {2012},
month = feb,
number = {2},
volume = {7},
abstract = {Protein backbone torsion angles (Phi) and (Psi) involve two rotation angles rotating around the Cα-N bond (Phi)
and the Cα-C bond (Psi). Due to the planarity of the linked rigid peptide bonds, these two angles can essentially determine
the backbone geometry of proteins. Accordingly, the accurate prediction of protein backbone torsion angle from sequence information
can assist the prediction of protein structures. In this study, we develop a new approach called TANGLE (Torsion ANGLE predictor) to
predict the protein backbone torsion angles from amino acid sequences. TANGLE uses a two-level support vector regression approach to
perform real-value torsion angle prediction using a variety of features derived from amino acid sequences, including the evolutionary
profiles in the form of position-specific scoring matrices, predicted secondary structure, solvent accessibility and natively disordered
region as well as other global sequence features. When evaluated based on a large benchmark dataset of 1,526 non-homologous proteins,
the mean absolute errors (MAEs) of the Phi and Psi angle prediction are 27.8° and 44.6°, respectively, which are 1% and 3% respectively
lower than that using one of the state-of-the-art prediction tools ANGLOR. Moreover, the prediction of TANGLE is significantly better than a
random predictor that was built on the amino acid-specific basis, with the p-value<1.46e-147 and 7.97e-150, respectively by the
Wilcoxon signed rank test. As a complementary approach to the current torsion angle prediction algorithms, TANGLE should prove useful in predicting
protein structural properties and assisting protein fold recognition by applying the predicted torsion angles as useful restraints. TANGLE is freely
accessible at http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/TANGLE/.},
articlenumber = {e30361},
doi = {10.1371/journal.pone.0030361},
keywords = {Bioinformatics},
publisher = {Public Library of Science},
related = {computational-biology},
url = {https://doi.org/10.1371/journal.pone.0030361},
}
ABSTRACT Protein backbone torsion angles (Phi) and (Psi) involve two rotation angles rotating around the Cα-N bond (Phi) and the Cα-C bond (Psi). Due to the planarity of the linked rigid peptide bonds, these two angles can essentially determine the backbone geometry of proteins. Accordingly, the accurate prediction of protein backbone torsion angle from sequence information can assist the prediction of protein structures. In this study, we develop a new approach called TANGLE (Torsion ANGLE predictor) to predict the protein backbone torsion angles from amino acid sequences. TANGLE uses a two-level support vector regression approach to perform real-value torsion angle prediction using a variety of features derived from amino acid sequences, including the evolutionary profiles in the form of position-specific scoring matrices, predicted secondary structure, solvent accessibility and natively disordered region as well as other global sequence features. When evaluated based on a large benchmark dataset of 1,526 non-homologous proteins, the mean absolute errors (MAEs) of the Phi and Psi angle prediction are 27.8° and 44.6°, respectively, which are 1% and 3% respectively lower than that using one of the state-of-the-art prediction tools ANGLOR. Moreover, the prediction of TANGLE is significantly better than a random predictor that was built on the amino acid-specific basis, with the p-value<1.46e-147 and 7.97e-150, respectively by the Wilcoxon signed rank test. As a complementary approach to the current torsion angle prediction algorithms, TANGLE should prove useful in predicting protein structural properties and assisting protein fold recognition by applying the predicted torsion angles as useful restraints. TANGLE is freely accessible at http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/TANGLE/.

Salem, H., Suraweera, P., Webb, G. I., & Boughton, J. R.
Proceedings of the 16th Pacific-Asia Conference, PAKDD 2012, Berlin/Heidelberg, pp. 50-61, 2012.
[Bibtex]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{SalemEtAl12,
author = {Salem, H. and Suraweera, P. and Webb, G. I. and Boughton, J. R.},
booktitle = {Proceedings of the 16th {Pacific}-{Asia} Conference, PAKDD 2012},
title = {Techniques for Efficient Learning without Search},
year = {2012},
pages = {50--61},
publisher = {Springer},
keywords = {Conditional Probability Estimation and AODE},
location = {Kuala Lumpur, Malaysia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT 

Webb, G. I., Boughton, J., Zheng, F., Ting, K. M., & Salem, H.
Machine Learning, 86(2), 233-272, 2012.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WebbEtAl12,
author = {Webb, G. I. and Boughton, J. and Zheng, F. and Ting, K. M. and Salem, H.},
journal = {Machine Learning},
title = {Learning by extrapolation from marginal to full-multivariate probability distributions: Decreasingly naive {Bayesian} classification},
year = {2012},
issn = {0885-6125},
number = {2},
pages = {233--272},
volume = {86},
abstract = {Averaged n-Dependence Estimators (AnDE) is an approach to probabilistic classification learning that learns by extrapolation from marginal
to full-multivariate probability distributions. It utilizes a single parameter that transforms the approach between a low-variance high-bias learner
(Naive Bayes) and a high-variance low-bias learner with Bayes optimal
asymptotic error. It extends the underlying strategy of Averaged One-Dependence Estimators (AODE), which relaxes the Naive Bayes independence assumption while retaining many of Naive Bayes' desirable computational and theoretical properties. AnDE further relaxes the independence assumption by generalizing AODE to higher-levels of dependence.
Extensive experimental evaluation shows that the bias-variance trade-off
for Averaged 2-Dependence Estimators results in strong predictive accuracy over a wide range of data sets. It has training time linear with
respect to the number of examples, supports incremental learning, handles directly missing values, and is robust in the face of noise. Beyond
the practical utility of its lower-dimensional variants, AnDE is of interest
in that it demonstrates that it is possible to create low-bias high-variance
generative learners and suggests strategies for developing even more powerful classifiers.},
doi = {10.1007/s10994-011-5263-6},
keywords = {Conditional Probability Estimation and AODE},
publisher = {Springer},
related = {learning-complex-conditional-probabilities-from-data},
url = {http://dx.doi.org/10.1007/s10994-011-5263-6},
}
ABSTRACT Averaged n-Dependence Estimators (AnDE) is an approach to probabilistic classification learning that learns by extrapolation from marginal to full-multivariate probability distributions. It utilizes a single parameter that transforms the approach between a low-variance high-bias learner (Naive Bayes) and a high-variance low-bias learner with Bayes optimal asymptotic error. It extends the underlying strategy of Averaged One-Dependence Estimators (AODE), which relaxes the Naive Bayes independence assumption while retaining many of Naive Bayes' desirable computational and theoretical properties. AnDE further relaxes the independence assumption by generalizing AODE to higher-levels of dependence. Extensive experimental evaluation shows that the bias-variance trade-off for Averaged 2-Dependence Estimators results in strong predictive accuracy over a wide range of data sets. It has training time linear with respect to the number of examples, supports incremental learning, handles directly missing values, and is robust in the face of noise. Beyond the practical utility of its lower-dimensional variants, AnDE is of interest in that it demonstrates that it is possible to create low-bias high-variance generative learners and suggests strategies for developing even more powerful classifiers.

Song, J., Tan, H., Perry, A. J., Akutsu, T., Webb, G. I., Whisstock, J. C., & Pike, R. N.
PLoS ONE, 7(11), Art. no. e50300, 2012.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl12b,
author = {Song, J. and Tan, H. and Perry, A. J. and Akutsu, T. and Webb, G. I. and Whisstock, J. C. and Pike, R. N.},
journal = {PLoS ONE},
title = {{PROSPER}: An Integrated Feature-Based Tool for Predicting Protease Substrate Cleavage Sites},
year = {2012},
number = {11},
volume = {7},
abstract = {The ability to catalytically cleave protein substrates after synthesis is fundamental for all forms of life. Accordingly, site-specific proteolysis is one of the most important post-translational modifications. The key to understanding the physiological role of a protease is to identify its natural substrate(s). Knowledge of the substrate specificity of a protease can dramatically improve our ability to predict its target protein substrates, but this information must be utilized in an effective manner in order to efficiently identify protein substrates by in silico approaches. To address this problem, we present PROSPER, an integrated feature-based server for in silico identification of protease substrates and their cleavage sites for twenty-four different proteases. PROSPER utilizes established specificity information for these proteases (derived from the MEROPS database) with a machine learning approach to predict protease cleavage sites by using different, but complementary sequence and structure characteristics. Features used by PROSPER include local amino acid sequence profile, predicted secondary structure, solvent accessibility and predicted native disorder. Thus, for proteases with known amino acid specificity, PROSPER provides a convenient, pre-prepared tool for use in identifying protein substrates for the enzymes. Systematic prediction analysis for the twenty-four proteases thus far included in the database revealed that the features we have included in the tool strongly improve performance in terms of cleavage site prediction, as evidenced by their contribution to performance improvement in terms of identifying known cleavage sites in substrates for these enzymes. In comparison with two state-of-the-art prediction tools, PoPS and SitePrediction, PROSPER achieves greater accuracy and coverage.
To our knowledge, PROSPER is the first comprehensive server capable of predicting cleavage sites of multiple proteases within a single substrate sequence using machine learning techniques. It is freely available at http://lightning.med.monash.edu.au/PROSPER/.},
articlenumber = {e50300},
doi = {10.1371/journal.pone.0050300},
keywords = {Bioinformatics},
publisher = {Public Library of Science},
related = {computational-biology},
url = {https://doi.org/10.1371/journal.pone.0050300},
}
ABSTRACT The ability to catalytically cleave protein substrates after synthesis is fundamental for all forms of life. Accordingly, site-specific proteolysis is one of the most important post-translational modifications. The key to understanding the physiological role of a protease is to identify its natural substrate(s). Knowledge of the substrate specificity of a protease can dramatically improve our ability to predict its target protein substrates, but this information must be utilized in an effective manner in order to efficiently identify protein substrates by in silico approaches. To address this problem, we present PROSPER, an integrated feature-based server for in silico identification of protease substrates and their cleavage sites for twenty-four different proteases. PROSPER utilizes established specificity information for these proteases (derived from the MEROPS database) with a machine learning approach to predict protease cleavage sites by using different, but complementary sequence and structure characteristics. Features used by PROSPER include local amino acid sequence profile, predicted secondary structure, solvent accessibility and predicted native disorder. Thus, for proteases with known amino acid specificity, PROSPER provides a convenient, pre-prepared tool for use in identifying protein substrates for the enzymes. Systematic prediction analysis for the twenty-four proteases thus far included in the database revealed that the features we have included in the tool strongly improve performance in terms of cleavage site prediction, as evidenced by their contribution to performance improvement in terms of identifying known cleavage sites in substrates for these enzymes. In comparison with two state-of-the-art prediction tools, PoPS and SitePrediction, PROSPER achieves greater accuracy and coverage. 
To our knowledge, PROSPER is the first comprehensive server capable of predicting cleavage sites of multiple proteases within a single substrate sequence using machine learning techniques. It is freely available at http://lightning.med.monash.edu.au/PROSPER/.

Mahmood, K., Webb, G. I., Song, J., Whisstock, J. C., & Konagurthu, A. S.
Nucleic Acids Research, 40(6), Art. no. e44, 2012.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{MahmoodEtAl2012,
author = {Mahmood, K. and Webb, G. I. and Song, J. and Whisstock, J. C. and Konagurthu, A. S.},
journal = {Nucleic Acids Research},
title = {Efficient large-scale protein sequence comparison and gene matching to identify orthologs and co-orthologs},
year = {2012},
number = {6},
volume = {40},
abstract = {Broadly, computational approaches for ortholog assignment is a three steps process: (i) identify all putative homologs between the genomes, (ii) identify gene anchors and (iii) link anchors to identify best gene matches given their order and context. In this article, we engineer two methods to improve two important aspects of this pipeline [specifically steps (ii) and (iii)]. First, computing sequence similarity data [step (i)] is a computationally intensive task for large sequence sets, creating a bottleneck in the ortholog assignment pipeline. We have designed a fast and highly scalable sort-join method (afree) based on k-mer counts to rapidly compare all pairs of sequences in a large protein sequence set to identify putative homologs. Second, availability of complex genomes containing large gene families with prevalence of complex evolutionary events, such as duplications, has made the task of assigning orthologs and co-orthologs difficult. Here, we have developed an iterative graph matching strategy where at each iteration the best gene assignments are identified resulting in a set of orthologs and co-orthologs. We find that the afree algorithm is faster than existing methods and maintains high accuracy in identifying similar genes. The iterative graph matching strategy also showed high accuracy in identifying complex gene relationships. Standalone afree available from http://vbc.med.monash.edu.au/~kmahmood/afree. EGM2, complete ortholog assignment pipeline (including afree and the iterative graph matching method) available from http://vbc.med.monash.edu.au/~kmahmood/EGM2.},
articlenumber = {e44},
doi = {10.1093/nar/gkr1261},
eprint = {http://nar.oxfordjournals.org/content/early/2011/12/29/nar.gkr1261.full.pdf+html},
keywords = {Bioinformatics},
publisher = {Oxford Journals},
related = {computational-biology},
url = {http://nar.oxfordjournals.org/content/early/2011/12/29/nar.gkr1261.abstract},
}
ABSTRACT Broadly, computational approaches for ortholog assignment is a three steps process: (i) identify all putative homologs between the genomes, (ii) identify gene anchors and (iii) link anchors to identify best gene matches given their order and context. In this article, we engineer two methods to improve two important aspects of this pipeline [specifically steps (ii) and (iii)]. First, computing sequence similarity data [step (i)] is a computationally intensive task for large sequence sets, creating a bottleneck in the ortholog assignment pipeline. We have designed a fast and highly scalable sort-join method (afree) based on k-mer counts to rapidly compare all pairs of sequences in a large protein sequence set to identify putative homologs. Second, availability of complex genomes containing large gene families with prevalence of complex evolutionary events, such as duplications, has made the task of assigning orthologs and co-orthologs difficult. Here, we have developed an iterative graph matching strategy where at each iteration the best gene assignments are identified resulting in a set of orthologs and co-orthologs. We find that the afree algorithm is faster than existing methods and maintains high accuracy in identifying similar genes. The iterative graph matching strategy also showed high accuracy in identifying complex gene relationships. Standalone afree available from http://vbc.med.monash.edu.au/~kmahmood/afree. EGM2, complete ortholog assignment pipeline (including afree and the iterative graph matching method) available from http://vbc.med.monash.edu.au/~kmahmood/EGM2.

Zheng, F., Webb, G. I., Suraweera, P., & Zhu, L.
Machine Learning, 87(1), 93-125, 2012.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ZhengEtAl12,
author = {Zheng, F. and Webb, G. I. and Suraweera, P. and Zhu, L.},
journal = {Machine Learning},
title = {Subsumption Resolution: An Efficient and Effective Technique for Semi-Naive {Bayesian} Learning},
year = {2012},
issn = {0885-6125},
number = {1},
pages = {93--125},
volume = {87},
abstract = {Semi-naive Bayesian techniques seek to improve the accuracy of naive
Bayes (NB) by relaxing the attribute independence assumption. We present a new
type of semi-naive Bayesian operation, Subsumption Resolution (SR), which efficiently identifies occurrences of the specialization-generalization relationship and
eliminates generalizations at classification time. We extend SR to Near-Subsumption
Resolution (NSR) to delete near-generalizations in addition to generalizations. We
develop two versions of SR: one that performs SR during training, called eager SR
(ESR), and another that performs SR during testing, called lazy SR (LSR). We investigate
the effect of ESR, LSR, NSR and conventional attribute elimination (BSE) on
NB and Averaged One-Dependence Estimators (AODE), a powerful alternative to
NB. BSE imposes very high training time overheads on NB and AODE accompanied
by varying decreases in classification time overheads. ESR, LSR and NSR impose
high training time and test time overheads on NB. However, LSR imposes no extra
training time overheads and only modest test time overheads on AODE, while ESR
and NSR impose modest training and test time overheads on AODE. Our extensive
experimental comparison on sixty UCI data sets shows that applying BSE, LSR or
NSR to NB significantly improves both zero-one loss and RMSE, while applying
BSE, ESR or NSR to AODE significantly improves zero-one loss and RMSE and
applying LSR to AODE significantly improves zero-one loss. The Friedman test and
Nemenyi test show that AODE with ESR or NSR have a significant zero-one loss and
RMSE advantage over Logistic Regression and a zero-one loss advantage over Weka's
LibSVM implementation with a grid parameter search on categorical data. AODE
with LSR has a zero-one loss advantage over Logistic Regression and comparable
zero-one loss with LibSVM. Finally, we examine the circumstances under which the
elimination of near-generalizations proves beneficial.},
doi = {10.1007/s10994-011-5275-2},
keywords = {Conditional Probability Estimation and AODE},
publisher = {Springer},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Semi-naive Bayesian techniques seek to improve the accuracy of naive Bayes (NB) by relaxing the attribute independence assumption. We present a new type of semi-naive Bayesian operation, Subsumption Resolution (SR), which efficiently identifies occurrences of the specialization-generalization relationship and eliminates generalizations at classification time. We extend SR to Near-Subsumption Resolution (NSR) to delete near-generalizations in addition to generalizations. We develop two versions of SR: one that performs SR during training, called eager SR (ESR), and another that performs SR during testing, called lazy SR (LSR). We investigate the effect of ESR, LSR, NSR and conventional attribute elimination (BSE) on NB and Averaged One-Dependence Estimators (AODE), a powerful alternative to NB. BSE imposes very high training time overheads on NB and AODE accompanied by varying decreases in classification time overheads. ESR, LSR and NSR impose high training time and test time overheads on NB. However, LSR imposes no extra training time overheads and only modest test time overheads on AODE, while ESR and NSR impose modest training and test time overheads on AODE. Our extensive experimental comparison on sixty UCI data sets shows that applying BSE, LSR or NSR to NB significantly improves both zero-one loss and RMSE, while applying BSE, ESR or NSR to AODE significantly improves zero-one loss and RMSE and applying LSR to AODE significantly improves zero-one loss. The Friedman test and Nemenyi test show that AODE with ESR or NSR have a significant zero-one loss and RMSE advantage over Logistic Regression and a zero-one loss advantage over Weka's LibSVM implementation with a grid parameter search on categorical data. AODE with LSR has a zero-one loss advantage over Logistic Regression and comparable zero-one loss with LibSVM. Finally, we examine the circumstances under which the elimination of near-generalizations proves beneficial.

Martinez, A., Webb, G. I., Flores, M., & Gamez, J.
Proceedings of the 7th International Conference on Hybrid Artificial Intelligent Systems, Berlin / Heidelberg, pp. 151-162, 2012.

@InProceedings{MartinezEtAl12,
author = {Martinez, A. and Webb, G. I. and Flores, M. and Gamez, J.},
booktitle = {Proceedings of the 7th International Conference on Hybrid Artificial Intelligent Systems},
title = {Non-Disjoint Discretization for Aggregating One-Dependence Estimator Classifiers},
year = {2012},
pages = {151--162},
publisher = {Springer},
isbn = {978-3-642-28930-9},
keywords = {Conditional Probability Estimation and AODE and discretization for naive bayes},
related = {discretization-for-naive-bayes},
}
ABSTRACT 

Song, J., Tan, H., Boyd, S. E., Shen, H., Mahmood, K., Webb, G. I., Akutsu, T., Whisstock, J. C., & Pike, R. N.
Journal of Bioinformatics and Computational Biology, 9(1), 149-178, 2011.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl11,
author = {Song, J. and Tan, H. and Boyd, S. E. and Shen, H. and Mahmood, K. and Webb, G. I. and Akutsu, T. and Whisstock, J. C. and Pike, R. N.},
journal = {Journal of Bioinformatics and Computational Biology},
title = {Bioinformatic Approaches for Predicting Substrates of Proteases},
year = {2011},
number = {1},
pages = {149--178},
volume = {9},
abstract = {Proteases have central roles in "life and death" processes due to their important ability to catalytically hydrolyse protein substrates, usually altering the function and/or activity of the target in the process. Knowledge of the substrate specificity of a protease should, in theory, dramatically improve the ability to predict target protein substrates. However, experimental identification and characterization of protease substrates is often difficult and time-consuming. Thus solving the "substrate identification" problem is fundamental to both understanding protease biology and the development of therapeutics that target specific protease-regulated pathways. In this context, bioinformatic prediction of protease substrates may provide useful and experimentally testable information about novel potential cleavage sites in candidate substrates. In this article, we provide an overview of recent advances in developing bioinformatic approaches for predicting protease substrate cleavage sites and identifying novel putative substrates. We discuss the advantages and drawbacks of the current methods and detail how more accurate models can be built by deriving multiple sequence and structural features of substrates. We also provide some suggestions about how future studies might further improve the accuracy of protease substrate specificity prediction.},
audit-trail = {http://www.worldscinet.com/jbcb/00/0001/S0219720011005288.html},
doi = {10.1142/S0219720011005288},
keywords = {Bioinformatics},
publisher = {World Scientific},
related = {computational-biology},
}
ABSTRACT Proteases have central roles in "life and death" processes due to their important ability to catalytically hydrolyse protein substrates, usually altering the function and/or activity of the target in the process. Knowledge of the substrate specificity of a protease should, in theory, dramatically improve the ability to predict target protein substrates. However, experimental identification and characterization of protease substrates is often difficult and time-consuming. Thus solving the "substrate identification" problem is fundamental to both understanding protease biology and the development of therapeutics that target specific protease-regulated pathways. In this context, bioinformatic prediction of protease substrates may provide useful and experimentally testable information about novel potential cleavage sites in candidate substrates. In this article, we provide an overview of recent advances in developing bioinformatic approaches for predicting protease substrate cleavage sites and identifying novel putative substrates. We discuss the advantages and drawbacks of the current methods and detail how more accurate models can be built by deriving multiple sequence and structural features of substrates. We also provide some suggestions about how future studies might further improve the accuracy of protease substrate specificity prediction.

Ng, N. M., Pierce, J. D., Webb, G. I., Ratnikov, B. I., Wijeyewickrema, L. C., Duncan, R. C., Robertson, A. L., Bottomley, S. P., Boyd, S. E., & Pike, R. N.
Biochemistry, 50(48), 10499-10507, 2011.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{NgEtAl11,
author = {Ng, N. M. and Pierce, J. D. and Webb, G. I. and Ratnikov, B. I. and Wijeyewickrema, L. C. and Duncan, R. C. and Robertson, A. L. and Bottomley, S. P. and Boyd, S. E. and Pike, R. N.},
journal = {Biochemistry},
title = {Discovery of Amino Acid Motifs for Thrombin Cleavage and Validation Using a Model Substrate},
year = {2011},
number = {48},
pages = {10499--10507},
volume = {50},
abstract = {Understanding the active site preferences of an enzyme is critical to the design of effective inhibitors and to gaining insights into its mechanisms of action on substrates. While the subsite specificity of thrombin is understood, it is not clear whether the enzyme prefers individual amino acids at each subsite in isolation or prefers to cleave combinations of amino acids as a motif. To investigate whether preferred peptide motifs for cleavage could be identified for thrombin, we exposed a phage-displayed peptide library to thrombin. The resulting preferentially cleaved substrates were analyzed using the technique of association rule discovery. The results revealed that thrombin selected for amino acid motifs in cleavage sites. The contribution of these hypothetical motifs to substrate cleavage efficiency was further investigated using the B1 IgG-binding domain of streptococcal protein G as a model substrate. Introduction of a P2-P1' LRS thrombin cleavage sequence within a major loop of the protein led to cleavage of the protein by thrombin, with the cleavage efficiency increasing with the length of the loop. Introduction of further P3-P1 and P1-P1'-P3' amino acid motifs into the loop region yielded greater cleavage efficiencies, suggesting that the susceptibility of a protein substrate to cleavage by thrombin is influenced by these motifs, perhaps because of cooperative effects between subsites closest to the scissile peptide bond.},
doi = {10.1021/bi201333g},
eprint = {http://pubs.acs.org/doi/pdf/10.1021/bi201333g},
keywords = {Bioinformatics},
related = {computational-biology},
url = {http://pubs.acs.org/doi/abs/10.1021/bi201333g},
}
ABSTRACT Understanding the active site preferences of an enzyme is critical to the design of effective inhibitors and to gaining insights into its mechanisms of action on substrates. While the subsite specificity of thrombin is understood, it is not clear whether the enzyme prefers individual amino acids at each subsite in isolation or prefers to cleave combinations of amino acids as a motif. To investigate whether preferred peptide motifs for cleavage could be identified for thrombin, we exposed a phage-displayed peptide library to thrombin. The resulting preferentially cleaved substrates were analyzed using the technique of association rule discovery. The results revealed that thrombin selected for amino acid motifs in cleavage sites. The contribution of these hypothetical motifs to substrate cleavage efficiency was further investigated using the B1 IgG-binding domain of streptococcal protein G as a model substrate. Introduction of a P2-P1' LRS thrombin cleavage sequence within a major loop of the protein led to cleavage of the protein by thrombin, with the cleavage efficiency increasing with the length of the loop. Introduction of further P3-P1 and P1-P1'-P3' amino acid motifs into the loop region yielded greater cleavage efficiencies, suggesting that the susceptibility of a protein substrate to cleavage by thrombin is influenced by these motifs, perhaps because of cooperative effects between subsites closest to the scissile peptide bond.

Ting, K. M., Wells, J., Tan, S., Teng, S., & Webb, G. I.
Machine Learning, 82(3), 375-397, 2011.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{TingEtAl11,
author = {Ting, K. M. and Wells, J. and Tan, S. and Teng, S. and Webb, G. I.},
journal = {Machine Learning},
title = {Feature-subspace aggregating: Ensembles for stable and unstable learners},
year = {2011},
issn = {0885-6125},
number = {3},
pages = {375--397},
volume = {82},
abstract = {This paper introduces a new ensemble approach, Feature-Subspace Aggregating (Feating), which builds local models instead of global models. Feating is a generic ensemble approach that can enhance the predictive performance of both stable and unstable learners. In contrast, most existing ensemble approaches can improve the predictive performance of unstable learners only. Our analysis shows that the new approach reduces the execution time to generate a model in an ensemble through an increased level of localisation in Feating. Our empirical evaluation shows that Feating performs significantly better than Boosting, Random Subspace and Bagging in terms of predictive accuracy, when a stable learner SVM is used as the base learner. The speed up achieved by Feating makes feasible SVM ensembles that would otherwise be infeasible for large data sets. When SVM is the preferred base learner, we show that Feating SVM performs better than Boosting decision trees and Random Forests. We further demonstrate that Feating also substantially reduces the error of another stable learner, k-nearest neighbour, and an unstable learner, decision tree.},
doi = {10.1007/s10994-010-5224-5},
keywords = {Feating and Multiboosting and Boosting},
publisher = {Springer},
related = {feating},
}
ABSTRACT This paper introduces a new ensemble approach, Feature-Subspace Aggregating (Feating), which builds local models instead of global models. Feating is a generic ensemble approach that can enhance the predictive performance of both stable and unstable learners. In contrast, most existing ensemble approaches can improve the predictive performance of unstable learners only. Our analysis shows that the new approach reduces the execution time to generate a model in an ensemble through an increased level of localisation in Feating. Our empirical evaluation shows that Feating performs significantly better than Boosting, Random Subspace and Bagging in terms of predictive accuracy, when a stable learner SVM is used as the base learner. The speed up achieved by Feating makes feasible SVM ensembles that would otherwise be infeasible for large data sets. When SVM is the preferred base learner, we show that Feating SVM performs better than Boosting decision trees and Random Forests. We further demonstrate that Feating also substantially reduces the error of another stable learner, k-nearest neighbour, and an unstable learner, decision tree.

Webb, G. I.
WIREs Data Mining and Knowledge Discovery, 1(3), 183-192, 2011.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{Webb11,
author = {Webb, G. I.},
journal = {WIREs Data Mining and Knowledge Discovery},
title = {Filtered-top-k Association Discovery},
year = {2011},
number = {3},
pages = {183--192},
volume = {1},
abstract = {Association mining has been one of the most intensively researched areas of data mining. However, direct uptake of the resulting technologies has been relatively low. This paper examines some of the reasons why the dominant paradigms in association mining have not lived up to their promise, and argues that a powerful alternative is provided by top-k techniques coupled with appropriate statistical and other filtering.},
doi = {10.1002/widm.28},
keywords = {Association Rule Discovery and statistically sound discovery},
publisher = {Wiley},
related = {filtered-top-k-association-discovery},
}
ABSTRACT Association mining has been one of the most intensively researched areas of data mining. However, direct uptake of the resulting technologies has been relatively low. This paper examines some of the reasons why the dominant paradigms in association mining have not lived up to their promise, and argues that a powerful alternative is provided by top-k techniques coupled with appropriate statistical and other filtering.

Mahmood, K., Konagurthu, A. S., Song, J., Buckle, A. M., Webb, G. I., & Whisstock, J. C.
Bioinformatics, 26(17), 2076-2084, 2010.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{MahmoodEtAl10,
author = {Mahmood, K. and Konagurthu, A. S. and Song, J. and Buckle, A. M. and Webb, G. I. and Whisstock, J. C.},
journal = {Bioinformatics},
title = {EGM: Encapsulated Gene-by-Gene Matching to Identify Gene Orthologs and Homologous Segments in Genomes},
year = {2010},
number = {17},
pages = {2076--2084},
volume = {26},
abstract = {Motivation: Identification of functionally equivalent genes in different species is essential to understand the evolution of biological pathways and processes. At the same time, identification of strings of conserved orthologous genes helps identify complex genomic rearrangements across different organisms. Such an insight is particularly useful, for example, in the transfer of experimental results between different experimental systems such as Drosophila and mammals.
Results: Here we describe the Encapsulated Gene-by-gene Matching (EGM) approach, a method that employs a graph matching strategy to identify gene orthologs and conserved gene segments. Given a pair of genomes, EGM constructs a global gene match for all genes taking into account gene context and family information. The Hungarian method for identifying the maximum weight matching in bipartite graphs is employed, where the resulting matching reveals one-to-one correspondences between nodes (genes) in a manner that maximizes the gene similarity and context.
Conclusion: We tested our approach by performing several comparisons including a detailed Human v Mouse genome mapping. We find that the algorithm is robust and sensitive in detecting orthologs and conserved gene segments. EGM can sensitively detect rearrangements within large and small chromosomal segments. The EGM tool is fully automated and easy to use compared to other more complex methods that also require extensive manual intervention and input.},
audit-trail = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq339v1},
doi = {10.1093/bioinformatics/btq339},
keywords = {Bioinformatics},
publisher = {Oxford Univ Press},
related = {computational-biology},
}
ABSTRACT Motivation: Identification of functionally equivalent genes in different species is essential to understand the evolution of biological pathways and processes. At the same time, identification of strings of conserved orthologous genes helps identify complex genomic rearrangements across different organisms. Such an insight is particularly useful, for example, in the transfer of experimental results between different experimental systems such as Drosophila and mammals. Results: Here we describe the Encapsulated Gene-by-gene Matching (EGM) approach, a method that employs a graph matching strategy to identify gene orthologs and conserved gene segments. Given a pair of genomes, EGM constructs a global gene match for all genes taking into account gene context and family information. The Hungarian method for identifying the maximum weight matching in bipartite graphs is employed, where the resulting matching reveals one-to-one correspondences between nodes (genes) in a manner that maximizes the gene similarity and context. Conclusion: We tested our approach by performing several comparisons including a detailed Human v Mouse genome mapping. We find that the algorithm is robust and sensitive in detecting orthologs and conserved gene segments. EGM can sensitively detect rearrangements within large and small chromosomal segments. The EGM tool is fully automated and easy to use compared to other more complex methods that also require extensive manual intervention and input.

Webb, G. I.
ACM Transactions on Knowledge Discovery from Data, 4, Art. no. 3, 2010.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb10,
author = {Webb, G. I.},
journal = {{ACM} Transactions on Knowledge Discovery from Data},
title = {Self-Sufficient Itemsets: An Approach to Screening Potentially Interesting Associations Between Items},
year = {2010},
volume = {4},
abstract = {Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or of their
supersets. We argue that itemsets that are not
self-sufficient will often be of little interest to the data
analyst, as their frequency should be expected once that of the
itemsets on which their frequency depends is known. We present
statistical tests for statistically sound discovery of
self-sufficient itemsets, and computational techniques that allow
those tests to be applied as a post-processing step for any itemset
discovery algorithm. We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.},
articlenumber = {3},
doi = {10.1145/1644873.1644876},
issue = {1},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
publisher = {ACM},
related = {filtered-top-k-association-discovery},
url = {http://dl.acm.org/authorize?270473},
}
ABSTRACT Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or of their supersets. We argue that itemsets that are not self-sufficient will often be of little interest to the data analyst, as their frequency should be expected once that of the itemsets on which their frequency depends is known. We present statistical tests for statistically sound discovery of self-sufficient itemsets, and computational techniques that allow those tests to be applied as a post-processing step for any itemset discovery algorithm. We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.

ICDM 2010, The 10th IEEE International Conference on Data Mining
Webb, G. I., Liu, B., Zhang, C., Gunopulos, D., & Wu, X. (Ed).
IEEE Computer Society, 2010.
[Bibtex]  → Access on publisher site

@Proceedings{WebbICDM2010,
editor = {Webb, G. I. and Liu, B. and Zhang, C. and Gunopulos, D. and Wu, X.},
title = {ICDM 2010, The 10th {IEEE} International Conference on Data Mining},
year = {2010},
publisher = {IEEE Computer Society},
url = {https://www.computer.org/csdl/proceedings/icdm/2010/4256/00/index.html},
}
ABSTRACT 

Song, J., Tan, H., Shen, H., Mahmood, K., Boyd, S. E., Webb, G. I., Akutsu, T., & Whisstock, J. C.
Bioinformatics, 26(6), 752-760, 2010.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl10,
author = {Song, J. and Tan, H. and Shen, H. and Mahmood, K. and Boyd, S. E. and Webb, G. I. and Akutsu, T. and Whisstock, J. C.},
journal = {Bioinformatics},
title = {Cascleave: Towards More Accurate Prediction of Caspase Substrate Cleavage Sites},
year = {2010},
number = {6},
pages = {752--760},
volume = {26},
abstract = {Motivation: The caspase family of cysteine proteases play essential roles in key biological processes such as programmed cell death, differentiation, proliferation, necrosis and inflammation. The complete repertoire of caspase substrates remains to be fully characterized. Accordingly, systematic computational screening studies of caspase substrate cleavage sites may provide insight into the substrate specificity of caspases and further facilitating the discovery of putative novel substrates. Results: In this article we develop an approach (termed Cascleave) to predict both classical (i.e. following a P1 Asp) and non-typical caspase cleavage sites. When using local sequence-derived profiles, Cascleave successfully predicted 82.2% of the known substrate cleavage sites, with a Matthews correlation coefficient (MCC) of 0.667. We found that prediction performance could be further improved by incorporating information such as predicted solvent accessibility and whether a cleavage sequence lies in a region that is most likely natively unstructured. Novel bi-profile Bayesian signatures were found to significantly improve the prediction performance and yielded the best performance with an overall accuracy of 87.6% and a MCC of 0.747, which is higher accuracy than published methods that essentially rely on amino acid sequence alone. It is anticipated that Cascleave will be a powerful tool for predicting novel substrate cleavage sites of caspases and shedding new insights on the unknown caspase-substrate interactivity relationship.},
audit-trail = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/26/6/752},
doi = {10.1093/bioinformatics/btq043},
keywords = {Bioinformatics},
publisher = {Oxford Univ Press},
related = {computational-biology},
}
ABSTRACT Motivation: The caspase family of cysteine proteases play essential roles in key biological processes such as programmed cell death, differentiation, proliferation, necrosis and inflammation. The complete repertoire of caspase substrates remains to be fully characterized. Accordingly, systematic computational screening studies of caspase substrate cleavage sites may provide insight into the substrate specificity of caspases and further facilitating the discovery of putative novel substrates. Results: In this article we develop an approach (termed Cascleave) to predict both classical (i.e. following a P1 Asp) and non-typical caspase cleavage sites. When using local sequence-derived profiles, Cascleave successfully predicted 82.2% of the known substrate cleavage sites, with a Matthews correlation coefficient (MCC) of 0.667. We found that prediction performance could be further improved by incorporating information such as predicted solvent accessibility and whether a cleavage sequence lies in a region that is most likely natively unstructured. Novel bi-profile Bayesian signatures were found to significantly improve the prediction performance and yielded the best performance with an overall accuracy of 87.6% and a MCC of 0.747, which is higher accuracy than published methods that essentially rely on amino acid sequence alone. It is anticipated that Cascleave will be a powerful tool for predicting novel substrate cleavage sites of caspases and shedding new insights on the unknown caspase-substrate interactivity relationship.

Encyclopedia of Machine Learning
Sammut, C., & Webb, G. I. (Ed).
Berlin: Springer, 2010.
[Bibtex]  → Access on publisher site

@Book{SammutWebb10,
editor = {Sammut, C. and Webb, G. I.},
title = {Encyclopedia of Machine Learning},
year = {2010},
publisher = {Springer},
url = {http://www.springer.com/us/book/9780387307688},
}
ABSTRACT 

Liu, B., Yang, Y., Webb, G. I., & Boughton, J.
Proceedings of the 13th Pacific-Asia Conference, PAKDD 2009, Berlin/Heidelberg, pp. 302-313, 2009.
[Bibtex]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{LiuYangWebbBoughton09,
author = {Liu, B. and Yang, Y. and Webb, G. I. and Boughton, J.},
booktitle = {Proceedings of the 13th {Pacific}-{Asia} Conference, PAKDD 2009},
title = {A Comparative Study of Bandwidth Choice in Kernel Density Estimation for Naive Bayesian Classification},
year = {2009},
pages = {302--313},
publisher = {Springer},
keywords = {Conditional Probability Estimation and AODE and Discretization for Naive Bayes},
location = {Bangkok, Thailand},
related = {discretization-for-naive-bayes},
}
ABSTRACT 

Hui, B., Yang, Y., & Webb, G. I.
Machine Learning, 77(1), 61-102, 2009.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{HuiYangWebb09,
author = {Hui, B. and Yang, Y. and Webb, G. I.},
journal = {Machine Learning},
title = {Anytime Classification for a Pool of Instances},
year = {2009},
number = {1},
pages = {61--102},
volume = {77},
abstract = {In many real-world applications of classification learning, such as
credit card transaction vetting or classification embedded in sensor
nodes, multiple instances simultaneously require classification under
computational resource constraints such as limited time or limited
battery capacity. In such a situation, available computational
resources should be allocated across the instances in order to
optimize the overall classification efficacy and efficiency. We
propose a novel anytime classification framework, Scheduling Anytime
Averaged Probabilistic Estimators (SAAPE), which is capable of
classifying a pool of instances, delivering accurate results whenever
interrupted and optimizing the collective classification
performance. Following the practice of our previous anytime
classification system AAPE, SAAPE runs a sequence of very efficient
Bayesian probabilistic classifiers to classify each single
instance. Furthermore, SAAPE implements seven alternative scheduling
schemes to decide which instance gets available computational
resources next such that a new classifier can be applied to refine its
classification. We formally present each scheduling scheme's
definition, rationale and time complexity. We conduct large-scale
experiments using 60 benchmark data sets and diversified statistical
tests to evaluate SAAPE's performance on zero-one loss classification
as well as on probability estimation. We analyze each scheduling
scheme's advantage and disadvantage according to both theoretical
understandings and empirical observations. Consequently we identify
effective scheduling schemes that enable SAAPE to accomplish accurate
anytime classification for a pool of instances.},
audit-trail = {http://dx.doi.org/10.1007/s10994-009-5118-6},
doi = {10.1007/s10994-009-5118-6},
keywords = {Conditional Probability Estimation and AODE},
publisher = {Springer},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT In many real-world applications of classification learning, such as credit card transaction vetting or classification embedded in sensor nodes, multiple instances simultaneously require classification under computational resource constraints such as limited time or limited battery capacity. In such a situation, available computational resources should be allocated across the instances in order to optimize the overall classification efficacy and efficiency. We propose a novel anytime classification framework, Scheduling Anytime Averaged Probabilistic Estimators (SAAPE), which is capable of classifying a pool of instances, delivering accurate results whenever interrupted and optimizing the collective classification performance. Following the practice of our previous anytime classification system AAPE, SAAPE runs a sequence of very efficient Bayesian probabilistic classifiers to classify each single instance. Furthermore, SAAPE implements seven alternative scheduling schemes to decide which instance gets available computational resources next such that a new classifier can be applied to refine its classification. We formally present each scheduling scheme's definition, rationale and time complexity. We conduct large-scale experiments using 60 benchmark data sets and diversified statistical tests to evaluate SAAPE's performance on zero-one loss classification as well as on probability estimation. We analyze each scheduling scheme's advantage and disadvantage according to both theoretical understandings and empirical observations. Consequently we identify effective scheduling schemes that enable SAAPE to accomplish accurate anytime classification for a pool of instances.

Song, J., Tan, H., Mahmood, K., Law, R. H. P., Buckle, A. M., Webb, G. I., Akutsu, T., & Whisstock, J. C.
PLoS ONE, 4(9), Art. no. e7072, 2009.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{SongEtAl09,
  title         = {Prodepth: Predict Residue Depth by Support Vector Regression Approach from Protein Sequences Only},
  author        = {Song, J. and Tan, H. and Mahmood, K. and Law, R. H. P. and Buckle, A. M. and Webb, G. I. and Akutsu, T. and Whisstock, J. C.},
  journal       = {PLoS ONE},
  publisher     = {PLOS},
  year          = {2009},
  volume        = {4},
  number        = {9},
  articlenumber = {e7072},
  doi           = {10.1371/journal.pone.0007072},
  audit-trail   = {http://www.plosone.org/article/info:doi/10.1371/journal.pone.0007072},
  keywords      = {Bioinformatics},
  related       = {computational-biology},
  abstract      = {Residue depth (RD) is a solvent exposure measure that complements the information provided by conventional accessible surface area (ASA) and describes to what extent a residue is buried in the protein structure space. Previous studies have established that RD is correlated with several protein properties, such as protein stability, residue conservation and amino acid types. Accurate prediction of RD has many potentially important applications in the field of structural bioinformatics, for example, facilitating the identification of functionally important residues, or residues in the folding nucleus, or enzyme active sites from sequence information. In this work, we introduce an efficient approach that uses support vector regression to quantify the relationship between RD and protein sequence. We systematically investigated eight different sequence encoding schemes including both local and global sequence characteristics and examined their respective prediction performances. For the objective evaluation of our approach, we used 5-fold cross-validation to assess the prediction accuracies and showed that the overall best performance could be achieved with a correlation coefficient (CC) of 0.71 between the observed and predicted RD values and a root mean square error (RMSE) of 1.74, after incorporating the relevant multiple sequence features. The results suggest that residue depth could be reliably predicted solely from protein primary sequences: local sequence environments are the major determinants, while global sequence features could influence the prediction performance marginally. We highlight two examples as a comparison in order to illustrate the applicability of this approach. We also discuss the potential implications of this new structural parameter in the field of protein structure prediction and homology modeling. This method might prove to be a powerful tool for sequence analysis.},
}
ABSTRACT Residue depth (RD) is a solvent exposure measure that complements the information provided by conventional accessible surface area (ASA) and describes to what extent a residue is buried in the protein structure space. Previous studies have established that RD is correlated with several protein properties, such as protein stability, residue conservation and amino acid types. Accurate prediction of RD has many potentially important applications in the field of structural bioinformatics, for example, facilitating the identification of functionally important residues, or residues in the folding nucleus, or enzyme active sites from sequence information. In this work, we introduce an efficient approach that uses support vector regression to quantify the relationship between RD and protein sequence. We systematically investigated eight different sequence encoding schemes including both local and global sequence characteristics and examined their respective prediction performances. For the objective evaluation of our approach, we used 5-fold cross-validation to assess the prediction accuracies and showed that the overall best performance could be achieved with a correlation coefficient (CC) of 0.71 between the observed and predicted RD values and a root mean square error (RMSE) of 1.74, after incorporating the relevant multiple sequence features. The results suggest that residue depth could be reliably predicted solely from protein primary sequences: local sequence environments are the major determinants, while global sequence features could influence the prediction performance marginally. We highlight two examples as a comparison in order to illustrate the applicability of this approach. We also discuss the potential implications of this new structural parameter in the field of protein structure prediction and homology modeling. This method might prove to be a powerful tool for sequence analysis.

Yang, Y., & Webb, G. I.
Machine Learning, 74(1), 39-74, 2009.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{YangWebb09,
  author      = {Yang, Y. and Webb, G. I.},
  title       = {Discretization for Naive-Bayes Learning: Managing Discretization Bias and Variance},
  journal     = {Machine Learning},
  year        = {2009},
  volume      = {74},
  number      = {1},
  pages       = {39-74},
  publisher   = {Springer},
  doi         = {10.1007/s10994-008-5083-5},
  keywords    = {Discretization for Naive Bayes and Conditional Probability Estimation and AODE},
  related     = {discretization-for-naive-bayes},
  audit-trail = {DOI 10.1007/s10994-008-5083-5},
  abstract    = {Quantitative attributes are usually discretized in Naive-Bayes learning. We establish simple conditions under which discretization is equivalent to use of the true probability density function during naive-Bayes learning. The use of different discretization techniques can be expected to affect the classification bias and variance of generated naive-Bayes classifiers, effects we name discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error. In particular, we supply insights into managing discretization bias and variance by adjusting the number of intervals and the number of training instances contained in each interval. We accordingly propose proportional discretization and fixed frequency discretization, two efficient unsupervised discretization methods that are able to effectively manage discretization bias and variance. We evaluate our new techniques against four key discretization methods for naive-Bayes classifiers. The experimental results support our theoretical analyses by showing that with statistically significant frequency, naive-Bayes classifiers trained on data discretized by our new methods are able to achieve lower classification error than those trained on data discretized by current established discretization methods.},
}
ABSTRACT Quantitative attributes are usually discretized in Naive-Bayes learning. We establish simple conditions under which discretization is equivalent to use of the true probability density function during naive-Bayes learning. The use of different discretization techniques can be expected to affect the classification bias and variance of generated naive-Bayes classifiers, effects we name discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error. In particular, we supply insights into managing discretization bias and variance by adjusting the number of intervals and the number of training instances contained in each interval. We accordingly propose proportional discretization and fixed frequency discretization, two efficient unsupervised discretization methods that are able to effectively manage discretization bias and variance. We evaluate our new techniques against four key discretization methods for naive-Bayes classifiers. The experimental results support our theoretical analyses by showing that with statistically significant frequency, naive-Bayes classifiers trained on data discretized by our new methods are able to achieve lower classification error than those trained on data discretized by current established discretization methods.

Ting, K. M., Wells, J. R., Tan, S. C., Teng, S. W., & Webb, G. I.
Proceedings of the 8th International Workshop on Multiple Classifier Systems, MCS 2009, Berlin, pp. 364-374, 2009.
[Bibtex]  → Access on publisher site  → Related papers and software

@InProceedings{TingEtAl09,
  author    = {Ting, K. M. and Wells, J. R. and Tan, S. C. and Teng, S. W. and Webb, G. I.},
  title     = {FaSS: Ensembles for Stable Learners},
  booktitle = {Proceedings of the 8th International Workshop on Multiple Classifier Systems, MCS 2009},
  year      = {2009},
  pages     = {364-374},
  publisher = {Springer},
  location  = {Reykjavik, Iceland},
  doi       = {10.1007/978-3-642-02326-2_37},
  keywords  = {Feating and Multiboosting and Boosting},
  related   = {feating},
}
ABSTRACT 

Novak, P., Lavrac, N., & Webb, G. I.
Journal of Machine Learning Research, 10, 377-403, 2009.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{NovakLavracWebb09,
  author   = {Novak, P. and Lavrac, N. and Webb, G. I.},
  title    = {Supervised Descriptive Rule Discovery: A Unifying Survey of Contrast Set, Emerging Pattern and Subgroup Mining},
  journal  = {Journal of Machine Learning Research},
  year     = {2009},
  volume   = {10},
  pages    = {377-403},
  url      = {http://www.jmlr.org/papers/volume10/kralj-novak09a/kralj-novak09a.pdf},
  keywords = {Association Rule Discovery and OPUS},
  related  = {filtered-top-k-association-discovery},
  abstract = {This paper gives a survey of contrast set mining (CSM), emerging pattern mining (EPM), and subgroup discovery (SD) in a unifying framework named supervised descriptive rule discovery. While all these research areas aim at discovering patterns in the form of rules induced from labeled data, they use different terminology and task definitions, claim to have different goals, claim to use different rule learning heuristics, and use different means for selecting subsets of induced patterns. This paper contributes a novel understanding of these subareas of data mining by presenting a unified terminology, by explaining the apparent differences between the learning tasks as variants of a unique supervised descriptive rule discovery task and by exploring the apparent differences between the approaches. It also shows that various rule learning heuristics used in CSM, EPM and SD algorithms all aim at optimizing a trade off between rule coverage and precision. The commonalities (and differences) between the approaches are showcased on a selection of best known variants of CSM, EPM and SD algorithms. The paper also provides a critical survey of existing supervised descriptive rule discovery visualization methods.},
}
ABSTRACT This paper gives a survey of contrast set mining (CSM), emerging pattern mining (EPM), and subgroup discovery (SD) in a unifying framework named supervised descriptive rule discovery. While all these research areas aim at discovering patterns in the form of rules induced from labeled data, they use different terminology and task definitions, claim to have different goals, claim to use different rule learning heuristics, and use different means for selecting subsets of induced patterns. This paper contributes a novel understanding of these subareas of data mining by presenting a unified terminology, by explaining the apparent differences between the learning tasks as variants of a unique supervised descriptive rule discovery task and by exploring the apparent differences between the approaches. It also shows that various rule learning heuristics used in CSM, EPM and SD algorithms all aim at optimizing a trade off between rule coverage and precision. The commonalities (and differences) between the approaches are showcased on a selection of best known variants of CSM, EPM and SD algorithms. The paper also provides a critical survey of existing supervised descriptive rule discovery visualization methods.

Multi-Strategy Ensemble Learning, Ensembles of Bayesian Classifiers, and the Problem of False Discoveries.
Webb, G. I.
Proceedings of the Seventh Australasian Data Mining Conference (AusDM 2008), pp. 15, 2008.
[Bibtex]

@InProceedings{Webb08b,
  author    = {Webb, G. I.},
  title     = {Multi-Strategy Ensemble Learning, Ensembles of Bayesian Classifiers, and the Problem of False Discoveries},
  booktitle = {Proceedings of the Seventh Australasian Data Mining Conference (AusDM 2008)},
  year      = {2008},
  pages     = {15},
  publisher = {Australian Computer Society},
  notes     = {Abstract},
}
ABSTRACT 

Webb, G. I.
Machine Learning, 71(2-3), 307-323, 2008.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb08,
  author      = {Webb, G. I.},
  title       = {Layered Critical Values: A Powerful Direct-Adjustment Approach to Discovering Significant Patterns},
  journal     = {Machine Learning},
  year        = {2008},
  volume      = {71},
  number      = {2-3},
  pages       = {307-323},
  publisher   = {Springer},
  doi         = {10.1007/s10994-008-5046-x},
  keywords    = {Association Rule Discovery and statistically sound discovery and OPUS},
  related     = {statistically-sound-association-discovery},
  notes       = {Technical Note},
  audit-trail = {DOI 10.1007/s10994-008-5046-x},
  abstract    = {Standard pattern discovery techniques, such as association rules, suffer an extreme risk of finding very large numbers of spurious patterns for many knowledge discovery tasks. The direct-adjustment approach to controlling this risk applies a statistical test during the discovery process, using a critical value adjusted to take account of the size of the search space. However, a problem with the direct-adjustment strategy is that it may discard numerous true patterns. This paper investigates the assignment of different critical values to different areas of the search space as an approach to alleviating this problem, using a variant of a technique originally developed for other purposes. This approach is shown to be effective at increasing the number of discoveries while still maintaining strict control over the risk of false discoveries.},
}
ABSTRACT Standard pattern discovery techniques, such as association rules, suffer an extreme risk of finding very large numbers of spurious patterns for many knowledge discovery tasks. The direct-adjustment approach to controlling this risk applies a statistical test during the discovery process, using a critical value adjusted to take account of the size of the search space. However, a problem with the direct-adjustment strategy is that it may discard numerous true patterns. This paper investigates the assignment of different critical values to different areas of the search space as an approach to alleviating this problem, using a variant of a technique originally developed for other purposes. This approach is shown to be effective at increasing the number of discoveries while still maintaining strict control over the risk of false discoveries.

Webb, G. I.
Data Mining and Knowledge Discovery, 15(1), 1-2, 2007.
[Bibtex]  → Access on publisher site

@Article{Webb07b,
  author    = {Webb, G. I.},
  title     = {Tenth Anniversary Edition Editorial},
  journal   = {Data Mining and Knowledge Discovery},
  year      = {2007},
  volume    = {15},
  number    = {1},
  pages     = {1-2},
  publisher = {Springer},
  doi       = {10.1007/s10618-007-0075-9},
}
ABSTRACT 

Faux, N. G., Huttley, G. A., Mahmood, K., Webb, G. I., Garcia de la Banda, M., & Whisstock, J. C.
Genome Research, 17(7), 1118-1127, 2007.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{FauxHuttleyMahmoodWebbGarciaWhisstock07,
author = {Faux, N. G. and Huttley, G. A. and Mahmood, K. and Webb, G. I. and Garcia de la Banda, M. and Whisstock, J. C.},
journal = {Genome Research},
title = {RCPdb: An evolutionary classification and codon usage database for repeat-containing proteins},
year = {2007},
issn = {1088-9051},
number = {7},
pages = {1118-1127},
volume = {17},
abstract = {Over 3% of human proteins contain single amino acid repeats (repeat-containing proteins, RCPs). Many repeats (homopeptides) localize to important proteins involved in transcription, and the expansion of certain repeats, in particular poly-Q and poly-A tracts, can also lead to the development of neurological diseases. Previous studies have suggested that the homopeptide makeup is a result of the presence of G+C-rich tracts in the encoding genes and that expansion occurs via replication slippage. Here, we have performed a large-scale genomic analysis of the variation of the genes encoding RCPs in 13 species and present these data in an online database (http://repeats.med.monash.edu.au/genetic_analysis/). This resource allows rapid comparison and analysis of RCPs, homopeptides, and their underlying genetic tracts across the eukaryotic species considered. We report three major findings. First, there is a bias for a small subset of codons being reiterated within homopeptides, and there is no G+C or A+T bias relative to the organism's transcriptome. Second, single base pair transversions from the homocodon are unusually common and may represent a mechanism of reducing the rate of homopeptide mutations. Third, homopeptides that are conserved across different species lie within regions that are under stronger purifying selection in contrast to nonconserved homopeptides.},
doi = {10.1101/gr.6255407},
keywords = {Bioinformatics},
publisher = {Cold Spring Harbor Laboratory Press},
related = {computational-biology},
}
ABSTRACT Over 3% of human proteins contain single amino acid repeats (repeat-containing proteins, RCPs). Many repeats (homopeptides) localize to important proteins involved in transcription, and the expansion of certain repeats, in particular poly-Q and poly-A tracts, can also lead to the development of neurological diseases. Previous studies have suggested that the homopeptide makeup is a result of the presence of G+C-rich tracts in the encoding genes and that expansion occurs via replication slippage. Here, we have performed a large-scale genomic analysis of the variation of the genes encoding RCPs in 13 species and present these data in an online database (http://repeats.med.monash.edu.au/genetic_analysis/). This resource allows rapid comparison and analysis of RCPs, homopeptides, and their underlying genetic tracts across the eukaryotic species considered. We report three major findings. First, there is a bias for a small subset of codons being reiterated within homopeptides, and there is no G+C or A+T bias relative to the organism's transcriptome. Second, single base pair transversions from the homocodon are unusually common and may represent a mechanism of reducing the rate of homopeptide mutations. Third, homopeptides that are conserved across different species lie within regions that are under stronger purifying selection in contrast to nonconserved homopeptides.

Webb, G. I.
Machine Learning, 68(1), 1-33, 2007.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb07,
  author      = {Webb, G. I.},
  title       = {Discovering Significant Patterns},
  journal     = {Machine Learning},
  year        = {2007},
  volume      = {68},
  number      = {1},
  pages       = {1-33},
  publisher   = {Springer},
  doi         = {10.1007/s10994-007-5006-x},
  keywords    = {Association Rule Discovery and statistically sound discovery and OPUS},
  related     = {statistically-sound-association-discovery},
  audit-trail = {subject to revisions},
  abstract    = {Exploratory pattern discovery techniques, such as association rule discovery, explore large search spaces of potential patterns to find those that satisfy some user-specified constraints. Due to the large number of patterns considered, they suffer from an extreme risk of type-1 error, that is, of finding patterns that appear due to chance alone to satisfy the constraints on the sample data. This paper proposes techniques to overcome this problem by applying well-established statistical practices. These allow the user to enforce a strict upper limit on the risk of experimentwise error. Empirical studies demonstrate that standard exploratory pattern discovery techniques can discover numerous spurious patterns when applied to random data and when applied to real-world data result in large numbers of patterns that are rejected when subjected to statistical evaluation on holdout data. They also reveal that modification of the pattern discovery process to anticipate subsequent statistical evaluation can increase the number of patterns that are accepted by statistical evaluation on holdout data.},
}
ABSTRACT Exploratory pattern discovery techniques, such as association rule discovery, explore large search spaces of potential patterns to find those that satisfy some user-specified constraints. Due to the large number of patterns considered, they suffer from an extreme risk of type-1 error, that is, of finding patterns that appear due to chance alone to satisfy the constraints on the sample data. This paper proposes techniques to overcome this problem by applying well-established statistical practices. These allow the user to enforce a strict upper limit on the risk of experimentwise error. Empirical studies demonstrate that standard exploratory pattern discovery techniques can discover numerous spurious patterns when applied to random data and when applied to real-world data result in large numbers of patterns that are rejected when subjected to statistical evaluation on holdout data. They also reveal that modification of the pattern discovery process to anticipate subsequent statistical evaluation can increase the number of patterns that are accepted by statistical evaluation on holdout data.

Zheng, F., & Webb, G. I.
Lecture Notes in Artificial Intelligence 4710: Proceedings of the 18th European Conference on Machine Learning (ECML'07), Berlin/Heidelberg, pp. 490-501, 2007.

@InProceedings{ZhengWebb07,
  author    = {Zheng, F. and Webb, G. I.},
  title     = {Finding the Right Family: Parent and Child Selection for Averaged One-Dependence Estimators},
  booktitle = {Lecture Notes in Artificial Intelligence 4710: Proceedings of the 18th European Conference on Machine Learning (ECML'07)},
  year      = {2007},
  pages     = {490-501},
  publisher = {Springer-Verlag},
  location  = {Warsaw, Poland},
  keywords  = {Conditional Probability Estimation and AODE},
  related   = {learning-complex-conditional-probabilities-from-data},
  abstract  = {Averaged One-Dependence Estimators (AODE) classifies by uniformly aggregating all qualified one-dependence estimators (ODEs). Its capacity to significantly improve naive Bayes' accuracy without undue time complexity has attracted substantial interest. Forward Sequential Selection and Backwards Sequential Elimination are effective wrapper techniques to identify and repair harmful interdependencies which have been profitably applied to naive Bayes. However, their straightforward application to AODE has previously proved ineffective. We investigate novel variants of these strategies. Our extensive experiments show that elimination of child attributes from within the constituent ODEs results in a significant improvement in probability estimate and reductions in bias and error relative to unmodified AODE. In contrast, elimination of complete constituent ODEs and the four types of attribute addition are found to be less effective and do not demonstrate any strong advantage over AODE. These surprising results lead to effective techniques for improving AODE's prediction accuracy.},
}
ABSTRACT Averaged One-Dependence Estimators (AODE) classifies by uniformly aggregating all qualified one-dependence estimators (ODEs). Its capacity to significantly improve naive Bayes' accuracy without undue time complexity has attracted substantial interest. Forward Sequential Selection and Backwards Sequential Elimination are effective wrapper techniques to identify and repair harmful interdependencies which have been profitably applied to naive Bayes. However, their straightforward application to AODE has previously proved ineffective. We investigate novel variants of these strategies. Our extensive experiments show that elimination of child attributes from within the constituent ODEs results in a significant improvement in probability estimate and reductions in bias and error relative to unmodified AODE. In contrast, elimination of complete constituent ODEs and the four types of attribute addition are found to be less effective and do not demonstrate any strong advantage over AODE. These surprising results lead to effective techniques for improving AODE's prediction accuracy.

Yang, Y., Webb, G. I., Cerquides, J., Korb, K., Boughton, J., & Ting, K-M.
IEEE Transactions on Knowledge and Data Engineering, 19(12), 1652-1665, 2007.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{YangWebbCerquideszKorbBoughtonTing07,
author = {Yang, Y. and Webb, G. I. and Cerquides, J. and Korb, K. and Boughton, J. and Ting, K-M.},
journal = {{IEEE} Transactions on Knowledge and Data Engineering},
title = {To Select or To Weigh: A Comparative Study of Linear Combination Schemes for SuperParent-One-Dependence Estimators},
year = {2007},
number = {12},
pages = {1652-1665},
volume = {19},
abstract = {We conduct a large-scale comparative study on linearly combining superparent-one-dependence estimators (SPODEs), a popular family of semi-naive Bayesian classifiers. Altogether 16 model selection and weighing schemes, 58 benchmark data sets, as well as various statistical tests are employed. This paper's main contributions are three-fold. First, it formally presents each scheme's definition, rationale and time complexity; and hence can serve as a comprehensive reference for researchers interested in ensemble learning. Second, it offers bias-variance analysis for each scheme's classification error performance. Third, it identifies effective schemes that meet various needs in practice. This leads to accurate and fast classification algorithms with immediate and significant impact on real-world applications. Another important feature of our study is using a variety of statistical tests to evaluate multiple learning methods across multiple data sets.},
doi = {10.1109/TKDE.2007.190650},
keywords = {Conditional Probability Estimation and AODE},
publisher = {{IEEE} Computer Society},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT We conduct a large-scale comparative study on linearly combining superparent-one-dependence estimators (SPODEs), a popular family of semi-naive Bayesian classifiers. Altogether 16 model selection and weighing schemes, 58 benchmark data sets, as well as various statistical tests are employed. This paper's main contributions are three-fold. First, it formally presents each scheme's definition, rationale and time complexity; and hence can serve as a comprehensive reference for researchers interested in ensemble learning. Second, it offers bias-variance analysis for each scheme's classification error performance. Third, it identifies effective schemes that meet various needs in practice. This leads to accurate and fast classification algorithms with immediate and significant impact on real-world applications. Another important feature of our study is using a variety of statistical tests to evaluate multiple learning methods across multiple data sets.

Finding the Real Patterns (Extended Abstract).
Webb, G. I.
Lecture Notes in Computer Science Vol. 4426: Advances in Knowledge Discovery and Data Mining, Proceedings of the 11th Pacific-Asia Conference, PAKDD 2007, Berlin/Heidelberg, pp. 6, 2007.
[Bibtex]

@InProceedings{Webb07a,
author = {Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 4426: Advances in Knowledge Discovery and Data Mining, Proceedings of the 11th {Pacific}-{Asia} Conference, PAKDD 2007},
title = {Finding the Real Patterns (Extended Abstract)},
year = {2007},
editor = {Zhou, Zhi-Hua and Li, Hang and Yang, Qiang},
pages = {6},
publisher = {Springer},
keywords = {OPUS},
location = {Nanjing, China},
}
ABSTRACT 

Yang, Y., Webb, G. I., Korb, K., & Ting, K-M.
Machine Learning, 69(1), 35-53, 2007.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{YangWebbKorbTing07,
  author      = {Yang, Y. and Webb, G. I. and Korb, K. and Ting, K-M.},
  title       = {Classifying under Computational Resource Constraints: Anytime Classification Using Probabilistic Estimators},
  journal     = {Machine Learning},
  year        = {2007},
  volume      = {69},
  number      = {1},
  pages       = {35-53},
  publisher   = {Springer},
  doi         = {10.1007/s10994-007-5020-z},
  keywords    = {Conditional Probability Estimation and AODE},
  related     = {learning-complex-conditional-probabilities-from-data},
  audit-trail = {DOI 10.1007/s10994-007-5020-z},
  abstract    = {In many online applications of machine learning, the computational resources available for classification will vary from time to time. Most techniques are designed to operate within the constraints of the minimum expected resources and fail to utilize further resources when they are available. We propose a novel anytime classification algorithm, anytime averaged probabilistic estimators (AAPE), which is capable of delivering strong prediction accuracy with little CPU time and utilizing additional CPU time to increase classification accuracy. The idea is to run an ordered sequence of very efficient Bayesian probabilistic estimators (single improvement steps) until classification time runs out. Theoretical studies and empirical validations reveal that by properly identifying, ordering, invoking and ensembling single improvement steps, AAPE is able to accomplish accurate classification whenever it is interrupted. It is also able to output class probability estimates beyond simple 0/1-loss classifications, as well as adeptly handle incremental learning.},
}
ABSTRACT In many online applications of machine learning, the computational resources available for classification will vary from time to time. Most techniques are designed to operate within the constraints of the minimum expected resources and fail to utilize further resources when they are available. We propose a novel anytime classification algorithm, anytime averaged probabilistic estimators (AAPE), which is capable of delivering strong prediction accuracy with little CPU time and utilizing additional CPU time to increase classification accuracy. The idea is to run an ordered sequence of very efficient Bayesian probabilistic estimators (single improvement steps) until classification time runs out. Theoretical studies and empirical validations reveal that by properly identifying, ordering, invoking and ensembling single improvement steps, AAPE is able to accomplish accurate classification whenever it is interrupted. It is also able to output class probability estimates beyond simple 0/1-loss classifications, as well as adeptly handle incremental learning.

Zheng, F., & Webb, G. I.
ACM International Conference Proceeding Series, Vol. 148: The Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06), New York, NY, pp. 1113-1120, 2006.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ZhengWebb06,
  author      = {Zheng, F. and Webb, G. I.},
  title       = {Efficient Lazy Elimination for Averaged One-Dependence Estimators},
  booktitle   = {ACM International Conference Proceeding Series, Vol. 148: The Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
  year        = {2006},
  editor      = {W. Cohen and A. Moore},
  pages       = {1113-1120},
  publisher   = {ACM Press},
  location    = {Pittsburgh, Pennsylvania},
  url         = {http://dl.acm.org/authorize?N00547},
  keywords    = {Conditional Probability Estimation and AODE},
  related     = {learning-complex-conditional-probabilities-from-data},
  audit-trail = {ISBN:1-59593-383-2, DOI http://doi.acm.org/10.1145/1143844.1143984},
  abstract    = {Semi-naive Bayesian classifiers seek to retain the numerous strengths of naive Bayes while reducing error by weakening the attribute independence assumption. Backwards Sequential Elimination (BSE) is a wrapper technique for attribute elimination that has proved effective at this task. We explore a new efficient technique, Lazy Elimination (LE), which eliminates highly related attribute-values at classification time without the computational overheads inherent in wrapper techniques. We analyze the effect of LE and BSE on Averaged One-Dependence Estimators (AODE), a state-of-the-art semi-naive Bayesian algorithm. Our extensive experiments show that LE significantly reduces bias and error without undue additional computation, while BSE significantly reduces bias but not error, with high training time complexity. In the context of AODE, LE has a significant advantage over BSE in both computational efficiency and error.},
}
ABSTRACT Semi-naive Bayesian classifiers seek to retain the numerous strengths of naive Bayes while reducing error by weakening the attribute independence assumption. Backwards Sequential Elimination (BSE) is a wrapper technique for attribute elimination that has proved effective at this task. We explore a new efficient technique, Lazy Elimination (LE), which eliminates highly related attribute-values at classification time without the computational overheads inherent in wrapper techniques. We analyze the effect of LE and BSE on Averaged One-Dependence Estimators (AODE), a state-of-the-art semi-naive Bayesian algorithm. Our extensive experiments show that LE significantly reduces bias and error without undue additional computation, while BSE significantly reduces bias but not error, with high training time complexity. In the context of AODE, LE has a significant advantage over BSE in both computational efficiency and error.

Webb, G. I.
Advances in Intelligent IT: Proceedings of the Fourth International Conference on Active Media Technology (AMT'06). [Extended Abstract], Amsterdam, pp. 7-12, 2006.

@InProceedings{Webb06b,
  author      = {Webb, G. I.},
  title       = {Anytime Learning and Classification for Online Applications},
  booktitle   = {Advances in Intelligent IT: Proceedings of the Fourth International Conference on Active Media Technology (AMT'06). [Extended Abstract]},
  year        = {2006},
  editor      = {Li, Y. and Looi, M. and Zhong, N.},
  pages       = {7-12},
  publisher   = {IOS Press},
  location    = {Brisbane, Australia},
  audit-trail = {ISSN 0922-6389},
  abstract    = {Many online applications of machine learning require fast classification and hence utilise efficient classifiers such as naive Bayes. However, outside periods of peak computation load, additional computational resources will often be available. Anytime classification can use whatever computational resources may be available at classification time to improve the accuracy of the classifications made.},
}
ABSTRACT Many online applications of machine learning require fast classification and hence utilise efficient classifiers such as naive Bayes. However, outside periods of peak computation load, additional computational resources will often be available. Anytime classification can use whatever computational resources may be available at classification time to improve the accuracy of the classifications made.

Webb, G. I.
Proceedings of the Twelfth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2006), New York, pp. 434-443, 2006.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{Webb06a,
  author    = {Webb, G. I.},
  title     = {Discovering Significant Rules},
  booktitle = {Proceedings of the Twelfth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2006)},
  year      = {2006},
  editor    = {Ungar, L. and Craven, M. and Gunopulos, D. and Eliassi-Rad, T.},
  pages     = {434-443},
  publisher = {The Association for Computing Machinery},
  url       = {http://dl.acm.org/authorize?N00546},
  keywords  = {OPUS and Association Rule Discovery and statistically sound discovery},
  related   = {statistically-sound-association-discovery},
  abstract  = {In many applications, association rules will only be interesting if they represent non-trivial correlations between all constituent items. Numerous techniques have been developed that seek to avoid false discoveries. However, while all provide useful solutions to aspects of this problem, none provides a generic solution that is both flexible enough to accommodate varying definitions of true and false discoveries and powerful enough to provide strict control over the risk of false discoveries. This paper presents generic techniques that allow definitions of true and false discoveries to be specified in terms of arbitrary statistical hypothesis tests and which provide strict control over the experimentwise risk of false discoveries.},
}
ABSTRACT In many applications, association rules will only be interesting if they represent non-trivial correlations between all constituent items. Numerous techniques have been developed that seek to avoid false discoveries. However, while all provide useful solutions to aspects of this problem, none provides a generic solution that is both flexible enough to accommodate varying definitions of true and false discoveries and powerful enough to provide strict control over the risk of false discoveries. This paper presents generic techniques that allow definitions of true and false discoveries to be specified in terms of arbitrary statistical hypothesis tests and which provide strict control over the experimentwise risk of false discoveries.

Yang, Y., & Webb, G. I.
In Wang, J. (Ed.), The Encyclopedia of Data Warehousing and Mining (pp. 392-396). Hershey, PA: Idea Group Inc., 2006.
[Bibtex]  → Access on publisher site

@InCollection{YangWebb05,
  author      = {Yang, Y. and Webb, G. I.},
  title       = {Discretization for Data Mining},
  booktitle   = {The Encyclopedia of Data Warehousing and Mining},
  editor      = {Wang, John},
  publisher   = {Idea Group Inc.},
  year        = {2006},
  pages       = {392-396},
  doi         = {10.4018/978-1-59140-557-3.ch075},
  audit-trail = {August 04 Copyright signed. Ying handling submission. PDF not posted},
}
ABSTRACT 

Lecture Notes in Artificial Intelligence 4099: Proceedings of the 9th Pacific Rim International Conference on Artificial Intelligence (PRICAI 2006)
Yang, Q., & Webb, G. I. (Eds.).
Berlin: Springer, 2006.
[Bibtex]

@Proceedings{YangWebb06,
  editor    = {Yang, Q. and Webb, G. I.},
  title     = {Lecture Notes in Artificial Intelligence 4099: Proceedings of the 9th {Pacific} Rim International Conference on Artificial Intelligence (PRICAI 2006)},
  series    = {Lecture Notes in Artificial Intelligence},
  publisher = {Springer},
  year      = {2006},
  location  = {Guilin, China},
}
ABSTRACT 

Lu, J., Yang, Y., & Webb, G. I.
Lecture Notes in Computer Science 4093: Proceedings of the Second International Conference on Advanced Data Mining and Applications (ADMA 2006), Berlin, pp. 223-238, 2006.

@InProceedings{LuYangWebb06,
author = {Lu, J. and Yang, Y. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 4093: Proceedings of the Second International Conference on Advanced Data Mining and Applications (ADMA 2006)},
title = {Incremental Discretization for Naive-Bayes Classifier},
year = {2006},
editor = {Li, Xue and Zaiane, Osmar R. and Li, Zhanhuai},
pages = {223-238},
publisher = {Springer},
abstract = {Naive-Bayes classifiers (NB) support incremental learning. However, the lack of effective incremental discretization methods has been hindering NB's incremental learning in face of quantitative data. This problem is further compounded by the fact that quantitative data are everywhere, from temperature readings to share prices. In this paper, we present a novel incremental discretization method for NB, incremental flexible frequency discretization (IFFD). IFFD discretizes values of a quantitative attribute into a sequence of intervals of flexible sizes. It allows online insertion and splitting operation on intervals. Theoretical analysis and experimental test are conducted to compare IFFD with alternative methods. Empirical evidence suggests that IFFD is efficient and effective. NB coupled with IFFD achieves a rapport between high learning efficiency and high classification accuracy in the context of incremental learning.},
keywords = {Conditional Probability Estimation and Discretization for Naive Bayes and Incremental Learning and Stream Mining},
location = {Xi'an, China},
related = {discretization-for-naive-bayes},
}
ABSTRACT Naive-Bayes classifiers (NB) support incremental learning. However, the lack of effective incremental discretization methods has been hindering NB's incremental learning in face of quantitative data. This problem is further compounded by the fact that quantitative data are everywhere, from temperature readings to share prices. In this paper, we present a novel incremental discretization method for NB, incremental flexible frequency discretization (IFFD). IFFD discretizes values of a quantitative attribute into a sequence of intervals of flexible sizes. It allows online insertion and splitting operation on intervals. Theoretical analysis and experimental test are conducted to compare IFFD with alternative methods. Empirical evidence suggests that IFFD is efficient and effective. NB coupled with IFFD achieves a rapport between high learning efficiency and high classification accuracy in the context of incremental learning.

Butler, S., & Webb, G. I.
In Wang, J. (Ed.), The Encyclopedia of Data Warehousing and Mining (pp. 795-799). Hershey, PA: Idea Group Inc., 2006.
[Bibtex]  → Access on publisher site

@InCollection{ButlerWebb05,
  author      = {Butler, S. and Webb, G. I.},
  title       = {Mining Group Differences},
  booktitle   = {The Encyclopedia of Data Warehousing and Mining},
  editor      = {Wang, John},
  publisher   = {Idea Group Inc.},
  year        = {2006},
  pages       = {795-799},
  doi         = {10.4018/978-1-60566-010-3.ch199},
  keywords    = {Association Rule Discovery},
  audit-trail = {August 04 Copyright signed. Shane handling submission. PDF not posted},
}
ABSTRACT 

Webb, G. I., & Brain, D.
LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications', Berlin/Heidelberg, pp. 1-13, 2006.

@InProceedings{WebbBrain05,
  author    = {Webb, G. I. and Brain, D.},
  title     = {Generality is Predictive of Prediction Accuracy},
  booktitle = {LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications'},
  publisher = {Springer},
  year      = {2006},
  pages     = {1-13},
  note      = {An earlier version of this paper was published in the Proceedings of PKAW 2002, pp 117-130},
  abstract  = {During knowledge acquisition it frequently occurs that multiple alternative potential rules all appear equally credible. This paper addresses the dearth of formal analysis about how to select between such alternatives. It presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. We argue that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. We also argue that in comparison to the more general rule, the accuracy of the more specific rule on unseen cases will tend to be closer to the accuracy obtained on training data. Experimental evidence is provided in support of these hypotheses. These hypotheses can be useful for selecting between rules in order to achieve specific knowledge acquisition objectives.},
  keywords  = {Generality},
  related   = {generality-is-predictive-of-prediction-accuracy},
}
ABSTRACT During knowledge acquisition it frequently occurs that multiple alternative potential rules all appear equally credible. This paper addresses the dearth of formal analysis about how to select between such alternatives. It presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. We argue that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. We also argue that in comparison to the more general rule, the accuracy of the more specific rule on unseen cases will tend to be closer to the accuracy obtained on training data. Experimental evidence is provided in support of these hypotheses. These hypotheses can be useful for selecting between rules in order to achieve specific knowledge acquisition objectives.

Huang, S., & Webb, G. I.
LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications', Berlin/Heidelberg, pp. 64-77, 2006.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{HuangWebb05b,
  author    = {Huang, S. and Webb, G. I.},
  title     = {Efficiently Identifying Exploratory Rules' Significance},
  booktitle = {LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications'},
  publisher = {Springer},
  year      = {2006},
  pages     = {64-77},
  doi       = {10.1007/11677437_6},
  note      = {An earlier version of this paper was published in S.J. Simoff and G.J. Williams (Eds.), Proceedings of the Third Australasian Data Mining Conference (AusDM04) Cairns, Australia. Sydney: University of Technology, pages 169-182.},
  abstract  = {How to efficiently discard potentially uninteresting rules in exploratory rule discovery is one of the important research foci in data mining. Many researchers have presented algorithms to automatically remove potentially uninteresting rules utilizing background knowledge and user-specified constraints. Identifying the significance of exploratory rules using a significance test is desirable for removing rules that may appear interesting by chance, hence providing the users with a more compact set of resulting rules. However, applying statistical tests to identify significant rules requires considerable computation and data access in order to obtain the necessary statistics. The situation gets worse as the size of the database increases. In this paper, we propose two approaches for improving the efficiency of significant exploratory rule discovery. We also evaluate the experimental effect in impact rule discovery which is suitable for discovering exploratory rules in very large, dense databases.},
  keywords  = {Association Rule Discovery and statistically sound discovery and OPUS and Impact Rules},
  related   = {statistically-sound-association-discovery},
}
ABSTRACT How to efficiently discard potentially uninteresting rules in exploratory rule discovery is one of the important research foci in data mining. Many researchers have presented algorithms to automatically remove potentially uninteresting rules utilizing background knowledge and user-specified constraints. Identifying the significance of exploratory rules using a significance test is desirable for removing rules that may appear interesting by chance, hence providing the users with a more compact set of resulting rules. However, applying statistical tests to identify significant rules requires considerable computation and data access in order to obtain the necessary statistics. The situation gets worse as the size of the database increases. In this paper, we propose two approaches for improving the efficiency of significant exploratory rule discovery. We also evaluate the experimental effect in impact rule discovery which is suitable for discovering exploratory rules in very large, dense databases.

Yang, Y., Webb, G. I., Cerquides, J., Korb, K., Boughton, J., & Ting, K-M.
Lecture Notes in Computer Science 4212: Proceedings of the 17th European Conference on Machine Learning (ECML'06), Berlin/Heidelberg, pp. 533-544, 2006.

@InProceedings{YangWebbCerquideKorbBoughtonTing06,
author = {Yang, Y. and Webb, G. I. and Cerquides, J. and Korb, K. and Boughton, J. and Ting, K-M.},
booktitle = {Lecture Notes in Computer Science 4212: Proceedings of the 17th European Conference on Machine Learning (ECML'06)},
title = {To Select or To Weigh: A Comparative Study of Model Selection and Model Weighing for SPODE Ensembles},
year = {2006},
editor = {F{\"u}rnkranz, J. and Scheffer, T. and Spiliopoulou, M.},
pages = {533-544},
publisher = {Springer-Verlag},
abstract = {An ensemble of Super-Parent-One-Dependence Estimators (SPODEs) offers a powerful yet simple alternative to naive Bayes classifiers, achieving significantly higher classification accuracy at a moderate cost in classification efficiency. Currently there exist two families of methodologies that ensemble candidate SPODEs for classification. One is to select only helpful SPODEs and uniformly average their probability estimates, a type of model selection. Another is to assign a weight to each SPODE and linearly combine their probability estimates, a methodology named model weighing. This paper presents a theoretical and empirical study comparing model selection and model weighing for ensembling SPODEs. The focus is on maximizing the ensemble's classification accuracy while minimizing its computational time. A number of representative selection and weighing schemes are studied, providing a comprehensive research on this topic and identifying effective schemes that provide alternative trades-offs between speed and expected error.},
keywords = {Conditional Probability Estimation and AODE},
location = {Berlin, Germany},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT An ensemble of Super-Parent-One-Dependence Estimators (SPODEs) offers a powerful yet simple alternative to naive Bayes classifiers, achieving significantly higher classification accuracy at a moderate cost in classification efficiency. Currently there exist two families of methodologies that ensemble candidate SPODEs for classification. One is to select only helpful SPODEs and uniformly average their probability estimates, a type of model selection. Another is to assign a weight to each SPODE and linearly combine their probability estimates, a methodology named model weighing. This paper presents a theoretical and empirical study comparing model selection and model weighing for ensembling SPODEs. The focus is on maximizing the ensemble's classification accuracy while minimizing its computational time. A number of representative selection and weighing schemes are studied, providing a comprehensive research on this topic and identifying effective schemes that provide alternative trades-offs between speed and expected error.

Webb, G. I., & Ting, K. M.
Machine Learning, 58(1), 25-32, 2005.

@Article{WebbTing05,
Title = {On the Application of ROC Analysis to Predict Classification Performance Under Varying Class Distributions},
Author = {Webb, G. I. and Ting, K. M.},
Journal = {Machine Learning},
Year = {2005},
Number = {1},
Pages = {25-32},
Volume = {58},
Abstract = {We counsel caution in the application of ROC analysis for prediction of classifier accuracy under varying class distributions. The heart of our contention is that in real-world applications variations of class distribution are likely to result from forces that affect the distribution of the attribute-values, rather than forces that directly affect the class distribution. In statistical terms, it is usually the class, rather than the attributes, that is the dependent variable. If the class distribution alters as an indirect consequence of changes in the distribution of the attribute values, rather than vice versa, performance estimates derived through ROC analysis may be grossly inaccurate.},
Audit-trail = {22/4 Preprint pdf posted},
Publisher = {Springer}
}
ABSTRACT We counsel caution in the application of ROC analysis for prediction of classifier accuracy under varying class distributions. The heart of our contention is that in real-world applications variations of class distribution are likely to result from forces that affect the distribution of the attribute-values, rather than forces that directly affect the class distribution. In statistical terms, it is usually the class, rather than the attributes, that is the dependent variable. If the class distribution alters as an indirect consequence of changes in the distribution of the attribute values, rather than vice versa, performance estimates derived through ROC analysis may be grossly inaccurate.

Yang, Y., Webb, G. I., & Wu, X.
In Maimon, O., & Rokach, L. (Eds.), The Data Mining and Knowledge Discovery Handbook (pp. 113-130). Berlin: Springer, 2005.
[Bibtex]  → Access on publisher site

@InCollection{YangWebbWu05,
  author    = {Yang, Y. and Webb, G. I. and Wu, X.},
  title     = {Discretization Methods},
  booktitle = {The Data Mining and Knowledge Discovery Handbook},
  editor    = {Maimon, O. and Rokach, L.},
  publisher = {Springer},
  year      = {2005},
  pages     = {113-130},
  doi       = {10.1007/978-0-387-09823-4_6},
}
ABSTRACT 

Webb, G. I., Boughton, J., & Wang, Z.
Machine Learning, 58(1), 5-24, 2005.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WebbBoughtonWang05,
  author      = {Webb, G. I. and Boughton, J. and Wang, Z.},
  title       = {Not So Naive {Bayes}: Aggregating One-Dependence Estimators},
  journal     = {Machine Learning},
  volume      = {58},
  number      = {1},
  pages       = {5-24},
  year        = {2005},
  publisher   = {Springer},
  doi         = {10.1007/s10994-005-4258-6},
  abstract    = {Of numerous proposals to improve the accuracy of naive Bayes by weakening its attribute independence assumption, both LBR and TAN have demonstrated remarkable error performance. However, both techniques obtain this outcome at a considerable computational cost. We present a new approach to weakening the attribute independence assumption by averaging all of a constrained class of classifiers. In extensive experiments this technique delivers comparable prediction accuracy to LBR and TAN with substantially improved computational efficiency.},
  keywords    = {Conditional Probability Estimation and AODE},
  related     = {learning-complex-conditional-probabilities-from-data},
  audit-trail = {3/5/04 Pre-print posted},
}
ABSTRACT Of numerous proposals to improve the accuracy of naive Bayes by weakening its attribute independence assumption, both LBR and TAN have demonstrated remarkable error performance. However, both techniques obtain this outcome at a considerable computational cost. We present a new approach to weakening the attribute independence assumption by averaging all of a constrained class of classifiers. In extensive experiments this technique delivers comparable prediction accuracy to LBR and TAN with substantially improved computational efficiency.

Webb, G. I., & Zhang, S.
Data Mining and Knowledge Discovery, 10(1), 39-79, 2005.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{WebbZhang05,
  author    = {Webb, G. I. and Zhang, S.},
  title     = {k-Optimal-Rule-Discovery},
  journal   = {Data Mining and Knowledge Discovery},
  volume    = {10},
  number    = {1},
  pages     = {39-79},
  year      = {2005},
  publisher = {Springer},
  doi       = {10.1007/s10618-005-0255-4},
  abstract  = {K-most-interesting rule discovery finds the k rules that optimize a user-specified measure of interestingness with respect to a set of sample data and user-specified constraints. This approach avoids many limitations of the frequent itemset approach of association rule discovery. This paper presents a scalable algorithm applicable to a wide range of k-most-interesting rule discovery tasks and demonstrates its efficiency.},
  keywords  = {Association Rule Discovery and statistically sound discovery and OPUS},
  related   = {filtered-top-k-association-discovery},
}
ABSTRACT K-most-interesting rule discovery finds the k rules that optimize a user-specified measure of interestingness with respect to a set of sample data and user-specified constraints. This approach avoids many limitations of the frequent itemset approach of association rule discovery. This paper presents a scalable algorithm applicable to a wide range of k-most-interesting rule discovery tasks and demonstrates its efficiency.

Zheng, F., & Webb, G. I.
Proceedings of the Fourth Australasian Data Mining Conference (AusDM05), Sydney, pp. 141-156, 2005.

@InProceedings{ZhengWebb05,
author = {Zheng, F. and Webb, G. I.},
booktitle = {Proceedings of the Fourth Australasian Data Mining Conference (AusDM05)},
title = {A Comparative Study of Semi-naive Bayes Methods in Classification Learning},
year = {2005},
editor = {Simoff, S. J. and Williams, G. J. and Galloway, J. and Kolyshkina, I.},
pages = {141-156},
publisher = {University of Technology},
abstract = {Numerous techniques have sought to improve the accuracy of Naive Bayes (NB) by alleviating the attribute interdependence problem. This paper summarizes these semi-naive Bayesian methods into two groups: those that apply conventional NB with a new attribute set, and those that alter NB by allowing inter-dependencies between attributes. We review eight typical semi-naive Bayesian learning algorithms and perform error analysis using the bias-variance decomposition on thirty-six natural domains from the UCI Machine Learning Repository. In analysing the results of these experiments we provide general recommendations for selection between methods.},
keywords = {AODE and Conditional Probability Estimation},
location = {Sydney, Australia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Numerous techniques have sought to improve the accuracy of Naive Bayes (NB) by alleviating the attribute interdependence problem. This paper summarizes these semi-naive Bayesian methods into two groups: those that apply conventional NB with a new attribute set, and those that alter NB by allowing inter-dependencies between attributes. We review eight typical semi-naive Bayesian learning algorithms and perform error analysis using the bias-variance decomposition on thirty-six natural domains from the UCI Machine Learning Repository. In analysing the results of these experiments we provide general recommendations for selection between methods.

Siu, K. K. W., Butler, S. M., Beveridge, T., Gillam, J. E., Hall, C. J., Kaye, A. H., Lewis, R. A., Mannan, K., McLoughlin, G., Pearson, S., Round, A. R., Schultke, E., Webb, G. I., & Wilkinson, S. J.
Nuclear Instruments and Methods in Physics Research A, 548, 140-146, 2005.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{SiuEtAl05,
author = {Siu, K. K. W. and Butler, S. M. and Beveridge, T. and Gillam, J. E. and Hall, C. J. and Kaye, A. H. and Lewis, R. A. and Mannan, K. and McLoughlin, G. and Pearson, S. and Round, A. R. and Schultke, E. and Webb, G. I. and Wilkinson, S. J.},
journal = {Nuclear Instruments and Methods in Physics Research A},
title = {Identifying markers of pathology in {SAXS} data of malignant tissues of the brain},
year = {2005},
pages = {140-146},
volume = {548},
abstract = {Conventional neuropathological analysis for brain malignancies is heavily reliant on the observation of morphological abnormalities, observed in thin, stained sections of tissue. Small Angle X-ray Scattering (SAXS) data provide an alternative means of distinguishing pathology by examining the ultra-structural (nanometer length scales) characteristics of tissue. To evaluate the diagnostic potential of SAXS for brain tumors, data was collected from normal, malignant and benign tissues of the human brain at station 2.1 of the Daresbury Laboratory Synchrotron Radiation Source and subjected to data mining and multivariate statistical analysis. The results suggest SAXS data may be an effective classifier of malignancy.},
doi = {10.1016/j.nima.2005.03.081},
keywords = {health},
publisher = {Elsevier},
related = {health},
}
ABSTRACT Conventional neuropathological analysis for brain malignancies is heavily reliant on the observation of morphological abnormalities, observed in thin, stained sections of tissue. Small Angle X-ray Scattering (SAXS) data provide an alternative means of distinguishing pathology by examining the ultra-structural (nanometer length scales) characteristics of tissue. To evaluate the diagnostic potential of SAXS for brain tumors, data was collected from normal, malignant and benign tissues of the human brain at station 2.1 of the Daresbury Laboratory Synchrotron Radiation Source and subjected to data mining and multivariate statistical analysis. The results suggest SAXS data may be an effective classifier of malignancy.

Yang, Y., Korb, K., Ting, K-M., & Webb, G. I.
Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005), Berlin/Heidelberg, pp. 102-111, 2005.

@InProceedings{YangKorbTingWebb05,
author = {Yang, Y. and Korb, K. and Ting, K-M. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005)},
title = {Ensemble Selection for SuperParent-One-Dependence Estimators},
year = {2005},
editor = {Zhang, S. and Jarvis, R.},
pages = {102-111},
publisher = {Springer},
abstract = {SuperParent-One-Dependence Estimators (SPODEs) loosen Naive-Bayes' attribute independence assumption by allowing each attribute to depend on a common single attribute (superparent) in addition to the class. An ensemble of SPODEs is able to achieve high classification accuracy with modest computational cost. This paper investigates how to select SPODEs for ensembling. Various popular model selection strategies are presented. Their learning efficacy and efficiency are theoretically analyzed and empirically verified. Accordingly, guidelines are investigated for choosing between selection criteria in differing contexts.},
audit-trail = {http://dx.doi.org/10.1007/11589990_13},
doi = {10.1007/11589990_13},
keywords = {Conditional Probability Estimation and AODE},
location = {Sydney, Australia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT SuperParent-One-Dependence Estimators (SPODEs) loosen Naive-Bayes' attribute independence assumption by allowing each attribute to depend on a common single attribute (superparent) in addition to the class. An ensemble of SPODEs is able to achieve high classification accuracy with modest computational cost. This paper investigates how to select SPODEs for ensembling. Various popular model selection strategies are presented. Their learning efficacy and efficiency are theoretically analyzed and empirically verified. Accordingly, guidelines are investigated for choosing between selection criteria in differing contexts.

Webb, G. I.
Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005)[Extended Abstract], Berlin/Heidelberg, pp. 1-2, 2005.

@InProceedings{Webb05a,
author = {Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005)[Extended Abstract]},
title = {K-Optimal Pattern Discovery: An Efficient and Effective Approach to Exploratory Data Mining},
year = {2005},
editor = {Zhang, S. and Jarvis, R.},
pages = {1-2},
publisher = {Springer},
audit-trail = {http://dx.doi.org/10.1007/11589990_1},
doi = {10.1007/11589990_1},
keywords = {Association Rule Discovery},
location = {Sydney, Australia},
}
ABSTRACT 

Huang, S., & Webb, G. I.
Proceedings of the Fifth SIAM International Conference on Data Mining (SDM'05) [short paper], Philadelphia, PA, pp. 541-545, 2005.

@InProceedings{HuangWebb05,
author = {Huang, S. and Webb, G. I.},
booktitle = {Proceedings of the Fifth {SIAM} International Conference on Data Mining ({SDM}'05) [short paper]},
title = {Discarding Insignificant Rules During Impact Rule Discovery in Large, Dense Databases},
year = {2005},
editor = {Kargupta, H. and Kamath, C. and Srivastava, J. and Goodman, A.},
pages = {541-545},
publisher = {Society for Industrial and Applied Mathematics},
abstract = {Considerable progress has been made on how to reduce the number of spurious exploratory rules with quantitative attributes. However, little has been done for rules with undiscretized quantitative attributes. It is argued that propositional rules can not effectively describe the interactions between quantitative and qualitative attributes. Aumann and Lindell proposed quantitative association rules to provide a better description of such relationship, together with a rule pruning technique. Since their technique is based on the frequent itemset framework, it is not suitable for rule discovery in large, dense databases. In this paper, an efficient technique for automatically discarding insignificant rules during rule discovery is proposed, based on the OPUS search algorithm. Experiments demonstrate that the algorithm we propose can efficiently remove potentially uninteresting rules even in very large, dense databases.},
audit-trail = {Shiying travelling to present paper. Requested permission to post pdf 10/2},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS and Impact Rules},
location = {Newport Beach, CA},
related = {impact-rules},
}
ABSTRACT Considerable progress has been made on how to reduce the number of spurious exploratory rules with quantitative attributes. However, little has been done for rules with undiscretized quantitative attributes. It is argued that propositional rules can not effectively describe the interactions between quantitative and qualitative attributes. Aumann and Lindell proposed quantitative association rules to provide a better description of such relationship, together with a rule pruning technique. Since their technique is based on the frequent itemset framework, it is not suitable for rule discovery in large, dense databases. In this paper, an efficient technique for automatically discarding insignificant rules during rule discovery is proposed, based on the OPUS search algorithm. Experiments demonstrate that the algorithm we propose can efficiently remove potentially uninteresting rules even in very large, dense databases.

Huang, S., & Webb, G. I.
Lecture Notes in Computer Science Vol. 3518: Proceedings of the 9th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2005), Berlin/Heidelberg, pp. 71-80, 2005.

@InProceedings{HuangWebb05a,
author = {Huang, S. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 3518: Proceedings of the 9th {Pacific}-{Asia} Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2005)},
title = {Pruning Derivative Partial Rules During Impact Rule Discovery},
year = {2005},
editor = {Ho, T. B. and Cheung, D. and Liu, H.},
pages = {71-80},
publisher = {Springer},
abstract = {Because exploratory rule discovery works with data that is only a sample of the phenomena to be investigated, some resulting rules may appear interesting only by chance. Techniques are developed for automatically discarding statistically insignificant exploratory rules that cannot survive a hypothesis with regard to its ancestors. We call such insignificant rules derivative extended rules. In this paper, we argue that there is another type of derivative exploratory rules, which is derivative with regard to their children. We also argue that considerable amount of such derivative partial rules can not be successfully removed using existing rule pruning techniques. We propose a new technique to address this problem. Experiments are done in impact rule discovery to evaluate the effect of this derivative partial rule filter. Results show that the inherent problem of too many resulting rules in exploratory rule discovery is alleviated.},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS and Impact Rules},
location = {Hanoi, Vietnam},
related = {impact-rules},
}
ABSTRACT Because exploratory rule discovery works with data that is only a sample of the phenomena to be investigated, some resulting rules may appear interesting only by chance. Techniques are developed for automatically discarding statistically insignificant exploratory rules that cannot survive a hypothesis with regard to its ancestors. We call such insignificant rules derivative extended rules. In this paper, we argue that there is another type of derivative exploratory rules, which is derivative with regard to their children. We also argue that considerable amount of such derivative partial rules can not be successfully removed using existing rule pruning techniques. We propose a new technique to address this problem. Experiments are done in impact rule discovery to evaluate the effect of this derivative partial rule filter. Results show that the inherent problem of too many resulting rules in exploratory rule discovery is alleviated.

Thiruvady, D. R., & Webb, G. I.
Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 04) [Short Paper], Berlin/Heidelberg, pp. 161-165, 2004.

@InProceedings{ThiruvadyWebb04,
  author      = {Thiruvady, D. R. and Webb, G. I.},
  booktitle   = {Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD 04) [Short Paper]},
  title       = {Mining Negative Rules using {GRD}},
  year        = {2004},
  editor      = {Dai, H. and Srikant, R. and Zhang, C.},
  pages       = {161--165},
  publisher   = {Springer},
  abstract    = {GRD is an algorithm for k-most interesting rule discovery. In contrast to association rule discovery, GRD does not require the use of a minimum support constraint. Rather, the user must specify a measure of interestingness and the number of rules sought (k). This paper reports efficient techniques to extend GRD to support mining of negative rules. We demonstrate that the new approach provides tractable discovery of both negative and positive rules.},
  audit-trail = {PDF posted 23/8},
  keywords    = {association rule discovery and OPUS},
  location    = {Sydney, Australia},
  related     = {filtered-top-k-association-discovery},
}
ABSTRACT GRD is an algorithm for k-most interesting rule discovery. In contrast to association rule discovery, GRD does not require the use of a minimum support constraint. Rather, the user must specify a measure of interestingness and the number of rules sought (k). This paper reports efficient techniques to extend GRD to support mining of negative rules. We demonstrate that the new approach provides tractable discovery of both negative and positive rules.

Wang, Z., Webb, G. I., & Zheng, F.
Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 04), Berlin/Heidelberg, pp. 319-328, 2004.

@InProceedings{WangWebbZheng04,
  author      = {Wang, Z. and Webb, G. I. and Zheng, F.},
  booktitle   = {Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD 04)},
  title       = {Selective Augmented {Bayesian} Network Classifiers Based on Rough Set Theory},
  year        = {2004},
  editor      = {Dai, H. and Srikant, R. and Zhang, C.},
  pages       = {319--328},
  publisher   = {Springer},
  abstract    = {The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. TAN, Tree-Augmented Naive Bayes, is a state-of-the-art extension of naive Bayes, that can express limited forms of inter-dependence among attributes. Rough sets theory provides tools for expressing inexact or partial dependencies within dataset. In this paper, we present a variant of TAN and compare their tree classifier structures, which can be thought of as a selective restricted trees Bayesian classifier. It delivers lower error than both pre-existing state-of-the-art TAN-based classifiers, with substantially less computation than is required by the SuperParent approach.},
  audit-trail = {PDF posted 23/8},
  keywords    = {Conditional Probability Estimation and AODE and Learning from large datasets},
  location    = {Sydney, Australia},
  related     = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. TAN, Tree-Augmented Naive Bayes, is a state-of-the-art extension of naive Bayes, that can express limited forms of inter-dependence among attributes. Rough sets theory provides tools for expressing inexact or partial dependencies within dataset. In this paper, we present a variant of TAN and compare their tree classifier structures, which can be thought of as a selective restricted trees Bayesian classifier. It delivers lower error than both pre-existing state-of-the-art TAN-based classifiers, with substantially less computation than is required by the SuperParent approach.

Webb, G. I., & Conilione, P.
(2004). Unpublished manuscript.

@Unpublished{WebbConilione04,
  author   = {Webb, Geoffrey I. and Conilione, Paul},
  title    = {Estimating bias and variance from data},
  note     = {Unpublished manuscript},
  year     = {2004},
  abstract = {The bias-variance decomposition of error provides useful insights into the error performance of a classifier as it is applied to different types of learning task. Most notably, it has been used to explain the extraordinary effectiveness of ensemble learning techniques. It is important that the research community have effective tools for assessing such explanations. To this end, techniques have been developed for estimating bias and variance from data. The most widely deployed of these uses repeated sub-sampling with a holdout set. We argue, with empirical support, that this approach has serious limitations. First, it provides very little flexibility in the types of distributions of training sets that may be studied. It requires that the training sets be relatively small and that the degree of variation between training sets be very circumscribed. Second, the approach leads to bias and variance estimates that have high statistical variance and hence low reliability. We develop an alternative method that is based on cross-validation. We show that this method allows far greater flexibility in the types of distribution that are examined and that the estimates derived are much more stable. Finally, we show that changing the distributions of training sets from which bias and variance estimates are drawn can alter substantially the bias and variance estimates that are derived.},
  keywords = {Learning from large datasets and Bias-Variance},
}
ABSTRACT The bias-variance decomposition of error provides useful insights into the error performance of a classifier as it is applied to different types of learning task. Most notably, it has been used to explain the extraordinary effectiveness of ensemble learning techniques. It is important that the research community have effective tools for assessing such explanations. To this end, techniques have been developed for estimating bias and variance from data. The most widely deployed of these uses repeated sub-sampling with a holdout set. We argue, with empirical support, that this approach has serious limitations. First, it provides very little flexibility in the types of distributions of training sets that may be studied. It requires that the training sets be relatively small and that the degree of variation between training sets be very circumscribed. Second, the approach leads to bias and variance estimates that have high statistical variance and hence low reliability. We develop an alternative method that is based on cross-validation. We show that this method allows far greater flexibility in the types of distribution that are examined and that the estimates derived are much more stable. Finally, we show that changing the distributions of training sets from which bias and variance estimates are drawn can alter substantially the bias and variance estimates that are derived.

Newlands, D. A., & Webb, G. I.
Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV), Southampton, UK, pp. 265-273, 2004.

@InProceedings{NewlandsWebb04a,
  author      = {Newlands, D. A. and Webb, G. I.},
  booktitle   = {Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV)},
  title       = {Alternative Strategies for Decision List Construction},
  year        = {2004},
  editor      = {Ebecken, N. F. F. E. and Brebbia, C. A. and Zanasi, A.},
  pages       = {265--273},
  publisher   = {WIT Press},
  abstract    = {This work surveys well-known approaches to building decision lists. Some novel variations to strategies based on default rules for the most common class and insertion of new rules before the default rule are presented. These are expected to offer speed up in the construction of the decision list as well as compression of the length of the list. These strategies and a testing regime have been implemented and some empirical studies done to compare the strategies. Experimental results are presented and interpreted. We show that all strategies deliver decision lists of comparable accuracy. However, two techniques are shown to deliver this accuracy with lists composed of significantly fewer rules than alternative strategies. Of these, one also demonstrates significant computational advantages. The prepending strategy is also demonstrated to produce decision lists which are as much as an order of magnitude shorter than those produced by CN2.},
  audit-trail = {Paper posted on web 9/8/04},
  keywords    = {Prepend},
  location    = {Rio de Janeiro, Brazil},
  related     = {prepending},
}
ABSTRACT This work surveys well-known approaches to building decision lists. Some novel variations to strategies based on default rules for the most common class and insertion of new rules before the default rule are presented. These are expected to offer speed up in the construction of the decision list as well as compression of the length of the list. These strategies and a testing regime have been implemented and some empirical studies done to compare the strategies. Experimental results are presented and interpreted. We show that all strategies deliver decision lists of comparable accuracy. However, two techniques are shown to deliver this accuracy with lists composed of significantly fewer rules than alternative strategies. Of these, one also demonstrates significant computational advantages. The prepending strategy is also demonstrated to produce decision lists which are as much as an order of magnitude shorter than those produced by CN2.

Newlands, D. A., & Webb, G. I.
Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV), Southampton, UK, pp. 285-294, 2004.

@InProceedings{NewlandsWebb04,
  author      = {Newlands, D. A. and Webb, G. I.},
  booktitle   = {Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV)},
  title       = {Convex Hulls as an Hypothesis Language Bias},
  year        = {2004},
  editor      = {Ebecken, N. F. F. E. and Brebbia, C. A. and Zanasi, A.},
  pages       = {285--294},
  publisher   = {WIT Press},
  abstract    = {Classification learning is dominated by systems which induce large numbers of small axis-orthogonal decision surfaces which biases such systems towards particular hypothesis types. However, there is reason to believe that many domains have underlying concepts which do not involve axis orthogonal surfaces. Further, the multiplicity of small decision regions mitigates against any holistic appreciation of the theories produced by these systems, notwithstanding the fact that many of the small regions are individually comprehensible. We propose the use of less strongly biased hypothesis languages which might be expected to model concepts using a number of structures close to the number of actual structures in the domain. An instantiation of such a language, a convex hull based classifier, CH1, has been implemented to investigate modeling concepts as a small number of large geometric structures in n-dimensional space. A comparison of the number of regions induced is made against other well-known systems on a representative selection of largely or wholly continuous valued machine learning tasks. The convex hull system is shown to produce a number of induced regions about an order of magnitude less than well-known systems and very close to the number of actual concepts. This representation, as convex hulls, allows the possibility of extraction of higher level mathematical descriptions of the induced concepts, using the techniques of computational geometry.},
  audit-trail = {Poor quality pdf posted has no ref},
  location    = {Rio de Janeiro, Brazil},
}
ABSTRACT Classification learning is dominated by systems which induce large numbers of small axis-orthogonal decision surfaces which biases such systems towards particular hypothesis types. However, there is reason to believe that many domains have underlying concepts which do not involve axis orthogonal surfaces. Further, the multiplicity of small decision regions mitigates against any holistic appreciation of the theories produced by these systems, notwithstanding the fact that many of the small regions are individually comprehensible. We propose the use of less strongly biased hypothesis languages which might be expected to model concepts using a number of structures close to the number of actual structures in the domain. An instantiation of such a language, a convex hull based classifier, CH1, has been implemented to investigate modeling concepts as a small number of large geometric structures in n-dimensional space. A comparison of the number of regions induced is made against other well-known systems on a representative selection of largely or wholly continuous valued machine learning tasks. The convex hull system is shown to produce a number of induced regions about an order of magnitude less than well-known systems and very close to the number of actual concepts. This representation, as convex hulls, allows the possibility of extraction of higher level mathematical descriptions of the induced concepts, using the techniques of computational geometry.

Webb, G. I., & Zheng, Z.
IEEE Transactions on Knowledge and Data Engineering, 16(8), 980-991, 2004.

@Article{WebbZheng04,
  author      = {Webb, G. I. and Zheng, Z.},
  journal     = {{IEEE} Transactions on Knowledge and Data Engineering},
  title       = {Multistrategy Ensemble Learning: Reducing Error by Combining Ensemble Learning Techniques},
  year        = {2004},
  number      = {8},
  pages       = {980--991},
  volume      = {16},
  abstract    = {Ensemble learning strategies, especially Boosting and Bagging decision trees, have demonstrated impressive capacities to improve the prediction accuracy of base learning algorithms. Further gains have been demonstrated by strategies that combine simple ensemble formation approaches. In this paper, we investigate the hypothesis that the improvement in accuracy of multi-strategy approaches to ensemble learning is due to an increase in the diversity of ensemble members that are formed. In addition, guided by this hypothesis, we develop three new multi-strategy ensemble-learning techniques. Experimental results in a wide variety of natural domains suggest that these multi-strategy ensemble-learning techniques are, on average, more accurate than their component ensemble learning techniques.},
  audit-trail = {Due for publication approx July 2004. {IEEE} copyright signed. 28/10/03 No paper posted - link to TKDE site given},
  keywords    = {MultiBoosting and Boosting},
  publisher   = {{IEEE} Computer Society},
  related     = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT Ensemble learning strategies, especially Boosting and Bagging decision trees, have demonstrated impressive capacities to improve the prediction accuracy of base learning algorithms. Further gains have been demonstrated by strategies that combine simple ensemble formation approaches. In this paper, we investigate the hypothesis that the improvement in accuracy of multi-strategy approaches to ensemble learning is due to an increase in the diversity of ensemble members that are formed. In addition, guided by this hypothesis, we develop three new multi-strategy ensemble-learning techniques. Experimental results in a wide variety of natural domains suggest that these multi-strategy ensemble-learning techniques are, on average, more accurate than their component ensemble learning techniques.

Lecture Notes in Computer Science 3339: Proceedings of the 17th Australian Joint Conference on Artificial Intelligence (AI 2004)
Webb, G. I., & Yu, X. (Eds.).
Berlin: Springer, 2004.
[Bibtex]

@Proceedings{WebbYu04,
  editor    = {Webb, G. I. and Yu, X.},
  title     = {Lecture Notes in Computer Science 3339: Proceedings of the 17th Australian Joint Conference on Artificial Intelligence (AI 2004)},
  series    = {Lecture Notes in Computer Science},
  publisher = {Springer},
  location  = {Cairns, Australia},
  year      = {2004},
}
ABSTRACT 

Yang, Y., & Webb, G. I.
Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD'03), Berlin/Heidelberg, pp. 501-512, 2003.

@InProceedings{YangWebb03,
  author      = {Yang, Y. and Webb, G. I.},
  booktitle   = {Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD'03)},
  title       = {Weighted Proportional k-Interval Discretization for {Naive-Bayes} Classifiers},
  year        = {2003},
  editor      = {Whang, K.-Y. and Jeon, J. and Shim, K. and Srivastava, J.},
  pages       = {501--512},
  publisher   = {Springer-Verlag},
  abstract    = {The use of different discretization techniques can be expected to affect the bias and variance of a learning algorithm. We call such an effect discretization bias and variance. Proportional k-interval discretization (PKID) tunes discretization bias and variance by adjusting discretized interval size and number proportional to the number of training instances. Theoretical analysis suggests that this is desirable for naive-Bayes classifiers. However PKID has sub-optimal performance when learning from small training data. We argue that this is because PKID equally weighs bias reduction and variance reduction. But for small data, variance reduction can contribute more to lower learning error and thus should be given greater weight than bias reduction. Accordingly we propose weighted proportional k-interval discretization (WPKID), which establishes a more suitable bias and variance trade-off for small data while allowing additional training data to be used to reduce both bias and variance. Our experiments demonstrate that for naive-Bayes classifiers, WPKID improves upon PKID for smaller datasets with significant frequency; and WPKID delivers lower classification error significantly more often than not in comparison to the other three leading alternative discretization techniques studied.},
  audit-trail = {Waiting on copy of copyright form from Ying},
  keywords    = {Discretization for Naive Bayes},
  location    = {Seoul, Korea},
  related     = {discretization-for-naive-bayes},
}
ABSTRACT The use of different discretization techniques can be expected to affect the bias and variance of a learning algorithm. We call such an effect discretization bias and variance. Proportional k-interval discretization (PKID) tunes discretization bias and variance by adjusting discretized interval size and number proportional to the number of training instances. Theoretical analysis suggests that this is desirable for naive-Bayes classifiers. However PKID has sub-optimal performance when learning from small training data. We argue that this is because PKID equally weighs bias reduction and variance reduction. But for small data, variance reduction can contribute more to lower learning error and thus should be given greater weight than bias reduction. Accordingly we propose weighted proportional k-interval discretization (WPKID), which establishes a more suitable bias and variance trade-off for small data while allowing additional training data to be used to reduce both bias and variance. Our experiments demonstrate that for naive-Bayes classifiers, WPKID improves upon PKID for smaller datasets with significant frequency; and WPKID delivers lower classification error significantly more often than not in comparison to the other three leading alternative discretization techniques studied.

Association Rules.
Webb, G. I.
In Ye, N. (Ed.), The Handbook of Data Mining, Chapter 2 (pp. 25-39). Lawrence Erlbaum Associates, 2003.
[Bibtex]

@InCollection{Webb03,
  author      = {Webb, G. I.},
  booktitle   = {The Handbook of Data Mining, Chapter 2},
  publisher   = {Lawrence Erlbaum Associates},
  title       = {Association Rules},
  year        = {2003},
  editor      = {Ye, Nong},
  pages       = {25--39},
  audit-trail = {*},
  keywords    = {Association Rule Discovery},
}
ABSTRACT 

Butler, S. M., Webb, G. I., & Lewis, R. A.
Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 677-685, 2003.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ButlerWebbLewis03,
  author    = {Butler, S. M. and Webb, G. I. and Lewis, R. A.},
  booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence ({AI} 03)},
  title     = {A Case Study in Feature Invention for Breast Cancer Diagnosis Using {X-Ray} Scatter Images},
  year      = {2003},
  editor    = {Gedeon, T. D. and Fung, L. C. C.},
  pages     = {677--685},
  publisher = {Springer},
  abstract  = {X-ray mammography is the current method for screening for breast cancer, and like any technique, has its limitations. Several groups have reported differences in the X-ray scattering patterns of normal and tumour tissue from the breast. This gives rise to the hope that X-ray scatter analysis techniques may lead to a more accurate and cost effective method of diagnosing breast cancer which lends itself to automation. This is a particularly challenging exercise due to the inherent complexity of the information content in X-ray scatter patterns from complex heterogenous tissue samples. We use a simple naive Bayes classifier, coupled with Equal Frequency Discretization (EFD) as our classification system. High-level features are extracted from the low-level pixel data. This paper reports some preliminary results in the ongoing development of this classification method that can distinguish between the diffraction patterns of normal and cancerous tissue, with particular emphasis on the invention of features for classification.},
  doi       = {10.1007/978-3-540-24581-0_58},
  keywords  = {health},
  related   = {health},
}
ABSTRACT X-ray mammography is the current method for screening for breast cancer, and like any technique, has its limitations. Several groups have reported differences in the X-ray scattering patterns of normal and tumour tissue from the breast. This gives rise to the hope that X-ray scatter analysis techniques may lead to a more accurate and cost effective method of diagnosing breast cancer which lends itself to automation. This is a particularly challenging exercise due to the inherent complexity of the information content in X-ray scatter patterns from complex heterogenous tissue samples. We use a simple naive Bayes classifier, coupled with Equal Frequency Discretization (EFD) as our classification system. High-level features are extracted from the low-level pixel data. This paper reports some preliminary results in the ongoing development of this classification method that can distinguish between the diffraction patterns of normal and cancerous tissue, with particular emphasis on the invention of features for classification.

Yang, Y., & Webb, G. I.
Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 440-452, 2003.

@InProceedings{YangWebb03c,
  author    = {Yang, Y. and Webb, G. I.},
  booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence ({AI} 03)},
  title     = {On Why Discretization Works for {Naive-Bayes} Classifiers},
  year      = {2003},
  editor    = {Gedeon, T. D. and Fung, L. C. C.},
  pages     = {440--452},
  publisher = {Springer},
  abstract  = {We investigate why discretization is effective in naive-Bayes learning. We prove a theorem that identifies particular conditions under which discretization will result in naive Bayes classifiers delivering the same probability estimates as would be obtained if the correct probability density functions were employed. We discuss the factors that might affect naive-Bayes classification error under discretization. We suggest that the use of different discretization techniques can affect the classification bias and variance of the generated classifiers, an effect named discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error.},
  keywords  = {Discretization for Naive Bayes},
  location  = {Perth, Australia},
  related   = {discretization-for-naive-bayes},
}
ABSTRACT We investigate why discretization is effective in naive-Bayes learning. We prove a theorem that identifies particular conditions under which discretization will result in naive Bayes classifiers delivering the same probability estimates as would be obtained if the correct probability density functions were employed. We discuss the factors that might affect naive-Bayes classification error under discretization. We suggest that the use of different discretization techniques can affect the classification bias and variance of the generated classifiers, an effect named discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error.

Webb, G. I.
Proceedings of the Second Australasian Data Mining Conference (AusDM03), Sydney, pp. 1-9, 2003.

@InProceedings{Webb03a,
  author      = {Webb, G. I.},
  booktitle   = {Proceedings of the Second Australasian Data Mining Conference (AusDM03)},
  title       = {Preliminary Investigations into Statistically Valid Exploratory Rule Discovery},
  year        = {2003},
  editor      = {Simoff, S. J. and Williams, G. J. and Hegland, M.},
  pages       = {1--9},
  publisher   = {University of Technology},
  abstract    = {Exploratory rule discovery, as exemplified by association rule discovery, has proven very popular. In this paper I investigate issues surrounding the statistical validity of rules found using this approach and methods that might be employed to deliver statistically sound exploratory rule discovery.},
  audit-trail = {Submitted to AusDM03. No copyright required. Check key words},
  keywords    = {Association Rule Discovery and statistically sound discovery and OPUS},
  location    = {Canberra, Australia},
  related     = {statistically-sound-association-discovery},
}
ABSTRACT Exploratory rule discovery, as exemplified by association rule discovery, has proven very popular. In this paper I investigate issues surrounding the statistical validity of rules found using this approach and methods that might be employed to deliver statistically sound exploratory rule discovery.

Wang, Z., Webb, G. I., & Zheng, F.
Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 453-465, 2003.

@InProceedings{WangWebbZheng03,
  author    = {Wang, Z. and Webb, G. I. and Zheng, F.},
  booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence ({AI} 03)},
  title     = {Adjusting Dependence Relations for Semi-Lazy {TAN} Classifiers},
  year      = {2003},
  editor    = {Gedeon, T. D. and Fung, L. C. C.},
  pages     = {453--465},
  publisher = {Springer},
  abstract  = {The naive Bayesian classifier is a simple and effective classification method, which assumes a Bayesian network in which each attribute has the class label as its only one parent. But this assumption is not obviously hold in many real world domains. Tree-Augmented Na{\"i}ve Bayes (TAN) is a state-of-the-art extension of the naive Bayes, which can express partial dependence relations among attributes. In this paper, we analyze the implementations of two different TAN classifiers and their tree structures. Experiments show how different dependence relations impact on accuracy of TAN classifiers. We present a kind of semi-lazy TAN classifier, which builds a TAN identical to the original TAN at training time, but adjusts the dependence relations for a new test instance at classification time. Our extensive experimental results show that this kind of semi-lazy classifier delivers lower error than the original TAN and is more efficient than SuperParent TAN.},
  keywords  = {Conditional Probability Estimation},
  location  = {Perth, Australia},
  related   = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT The naive Bayesian classifier is a simple and effective classification method, which assumes a Bayesian network in which each attribute has the class label as its only one parent. But this assumption is not obviously hold in many real world domains. Tree-Augmented Naïve Bayes (TAN) is a state-of-the-art extension of the naive Bayes, which can express partial dependence relations among attributes. In this paper, we analyze the implementations of two different TAN classifiers and their tree structures. Experiments show how different dependence relations impact on accuracy of TAN classifiers. We present a kind of semi-lazy TAN classifier, which builds a TAN identical to the original TAN at training time, but adjusts the dependence relations for a new test instance at classification time. Our extensive experimental results show that this kind of semi-lazy classifier delivers lower error than the original TAN and is more efficient than SuperParent TAN.

Shi, H., Wang, Z., Webb, G. I., & Huang, H.
Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD'03), Berlin/Heidelberg, pp. 265-270, 2003.

@InProceedings{ShiWangWebbHuang03,
  author      = {Shi, H. and Wang, Z. and Webb, G. I. and Huang, H.},
  booktitle   = {Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD'03)},
  title       = {A New Restricted {Bayesian} Network Classifier},
  year        = {2003},
  editor      = {Whang, K.-Y. and Jeon, J. and Shim, K. and Srivastava, J.},
  pages       = {265--270},
  publisher   = {Springer-Verlag},
  abstract    = {On the basis of examining the existing restricted Bayesian network classifiers, a new Bayes-theorem-based and more strictly restricted Bayesian-network-based classification model DLBAN is proposed, which can be viewed as a double-level Bayesian network augmented naive Bayes classification. The experimental results show that the DLBAN classifier is better than the TAN classifier in the most cases.},
  audit-trail = {*},
  keywords    = {Conditional Probability Estimation},
  location    = {Seoul, Korea},
  related     = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT On the basis of examining the existing restricted Bayesian network classifiers, a new Bayes-theorem-based and more strictly restricted Bayesian-network-based classification model DLBAN is proposed, which can be viewed as a double-level Bayesian network augmented naive Bayes classification. The experimental results show that the DLBAN classifier is better than the TAN classifier in the most cases.

Rolfe, B., Hodgson, P., & Webb, G. I.
Intelligence in a Small World - Nanomaterials for the 21st Century. Selected Papers from IPMM-2003, Boca Raton, Florida, 2003.

@InProceedings{RolfeHodgsonWebb03,
  author      = {Rolfe, B. and Hodgson, P. and Webb, G. I.},
  booktitle   = {Intelligence in a Small World - Nanomaterials for the 21st Century. Selected Papers from IPMM-2003},
  title       = {Improving the Prediction of the Roll Separating Force in a Hot Steel Finishing Mill},
  year        = {2003},
  editor      = {Meech, J. A.},
  publisher   = {CRC-Press},
  audit-trail = {*},
  keywords    = {Engineering Applications},
  location    = {Sendai, Japan},
  related     = {engineering-applications},
}
ABSTRACT 

Zhang, C., Zhang, S., & Webb, G. I.
Applied Intelligence, 18, 91-104, 2003.
[Bibtex] [Abstract]  → Access on publisher site

@Article{ZhangZhangWebb03,
author = {Zhang, C. and Zhang, S. and Webb, G. I.},
journal = {Applied Intelligence},
title = {Identifying Approximate Itemsets of Interest In Large Databases},
year = {2003},
pages = {91--104},
volume = {18},
abstract = {This paper presents a method for discovering approximate frequent itemsets of interest in large scale databases. This method uses the central limit theorem to increase efficiency, enabling us to reduce the sample size by about half compared to previous approximations. Further efficiency is gained by pruning from the search space uninteresting frequent itemsets. In addition to improving efficiency, this measure also reduces the number of itemsets that the user need consider. The model and algorithm have been implemented and evaluated using both synthetic and real-world databases. Our experimental results demonstrate the efficiency of the approach},
audit-trail = {Link to paper via Kluwer site. No PDF posted},
keywords = {Association Rule Discovery},
publisher = {Springer},
}
ABSTRACT This paper presents a method for discovering approximate frequent itemsets of interest in large scale databases. This method uses the central limit theorem to increase efficiency, enabling us to reduce the sample size by about half compared to previous approximations. Further efficiency is gained by pruning from the search space uninteresting frequent itemsets. In addition to improving efficiency, this measure also reduces the number of itemsets that the user need consider. The model and algorithm have been implemented and evaluated using both synthetic and real-world databases. Our experimental results demonstrate the efficiency of the approach

Rolfe, B., Frayman, Y., Webb, G. I., & Hodgson, P.
Proceedings of the 9th International Conference on Manufacturing Excellence (ICME 03), 2003.

@InProceedings{RolfeFraymanWebbHodgson03,
  author    = {Rolfe, B. and Frayman, Y. and Webb, G. I. and Hodgson, P.},
  title     = {Analysis of Stamping Production Data with View Towards Quality Management},
  booktitle = {Proceedings of the 9th International Conference on Manufacturing Excellence (ICME 03)},
  year      = {2003},
  location  = {Melbourne, Australia},
  keywords  = {Engineering Applications},
  related   = {engineering-applications},
}
ABSTRACT 

Webb, G. I., Butler, S., & Newlands, D.
Proceedings of The Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2003), New York, pp. 256-265, 2003.

@InProceedings{WebbButlerNewlands03,
author = {Webb, G. I. and Butler, S. and Newlands, D.},
booktitle = {Proceedings of The Ninth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2003)},
title = {On Detecting Differences Between Groups},
year = {2003},
editor = {Domingos, P. and Faloutsos, C. and Senator, T. and Kargupta, H. and Getoor, L.},
pages = {256--265},
publisher = {The Association for Computing Machinery},
abstract = {Understanding the differences between contrasting groups is a fundamental task in data analysis. This realization has led to the development of a new special purpose data mining technique, {\em contrast-set mining}. We undertook a study with a retail collaborator to compare contrast-set mining with existing rule-discovery techniques. To our surprise we observed that straightforward application of an existing commercial rule-discovery system, Magnum Opus, could successfully perform the contrast-set-mining task. This led to the realization that contrast-set mining is a special case of the more general rule-discovery task. We present the results of our study together with a proof of this conclusion},
audit-trail = {PDF with ACM copyright posted in accordance with conditions of copyright},
keywords = {OPUS and Association Rule Discovery},
location = {Washington, DC},
related = {filtered-top-k-association-discovery},
}
ABSTRACT Understanding the differences between contrasting groups is a fundamental task in data analysis. This realization has led to the development of a new special purpose data mining technique, {\em contrast-set mining}. We undertook a study with a retail collaborator to compare contrast-set mining with existing rule-discovery techniques. To our surprise we observed that straightforward application of an existing commercial rule-discovery system, Magnum Opus, could successfully perform the contrast-set-mining task. This led to the realization that contrast-set mining is a special case of the more general rule-discovery task. We present the results of our study together with a proof of this conclusion

Webb, G. I., & Zhang, S.
Proceedings of the First International NAISO Congress on Autonomous Intelligent Systems (ICAIS 2002), Canada/The Netherlands, 2002.

@InProceedings{WebbZhang02,
  author      = {Webb, G. I. and Zhang, S.},
  booktitle   = {Proceedings of the First International NAISO Congress on Autonomous Intelligent Systems (ICAIS 2002)},
  title       = {Removing Trivial Associations in Association Rule Discovery},
  year        = {2002},
  abstract    = {Association rule discovery has become one of the most widely applied data mining strategies. Techniques for association rule discovery have been dominated by the frequent itemset strategy as exemplified by the Apriori algorithm. One limitation of this approach is that it provides little opportunity to detect and remove association rules on the basis of relationships between rules. As a result, the association rules discovered are frequently swamped with large numbers of spurious rules that are of little interest to the user. This paper presents association rule discovery techniques that can detect and discard one form of spurious association rule: trivial associations.},
  audit-trail = {Pre-publication PDF posted},
  keywords    = {OPUS and Association Rule Discovery},
  location    = {Geelong, Australia},
  related     = {filtered-top-k-association-discovery},
}
ABSTRACT Association rule discovery has become one of the most widely applied data mining strategies. Techniques for association rule discovery have been dominated by the frequent itemset strategy as exemplified by the Apriori algorithm. One limitation of this approach is that it provides little opportunity to detect and remove association rules on the basis of relationships between rules. As a result, the association rules discovered are frequently swamped with large numbers of spurious rules that are of little interest to the user. This paper presents association rule discovery techniques that can detect and discard one form of spurious association rule: trivial associations.

Yang, Y., & Webb, G. I.
Proceedings of the 2002 Pacific Rim Knowledge Acquisition Workshop (PKAW'02), Tokyo, pp. 159-173, 2002.

@InProceedings{YangWebb02a,
author = {Yang, Y. and Webb, G. I.},
booktitle = {Proceedings of the 2002 {Pacific} Rim Knowledge Acquisition Workshop (PKAW'02)},
title = {A Comparative Study of Discretization Methods for Naive-Bayes Classifiers},
year = {2002},
editor = {Yamaguchi, T. and Hoffmann, A. and Motoda, H. and Compton, P.},
pages = {159--173},
publisher = {Japanese Society for Artificial Intelligence},
abstract = {Discretization is a popular approach to handling numeric attributes in machine learning. We argue that the requirements for effective discretization differ between naive-Bayes learning and many other learning algorithms. We evaluate the effectiveness with naive-Bayes classifiers of nine discretization methods, equal width discretization (EWD), equal frequency discretization (EFD), fuzzy discretization (FD), entropy minimization discretization (EMD), iterative discretization (ID), proportional k-interval discretization (PKID), lazy discretization (LD), non-disjoint discretization (NDD) and weighted proportional k-interval discretization (WPKID). It is found that in general naive-Bayes classifiers trained on data preprocessed by LD, NDD or WPKID achieve lower classification error than those trained on data preprocessed by the other discretization methods. But LD can not scale to large data. This study leads to a new discretization method, weighted non-disjoint discretization (WNDD) that combines WPKID and NDD's advantages. Our experiments show that among all the rival discretization methods, WNDD best helps naive-Bayes classifiers reduce average classification error.},
audit-trail = {*},
keywords = {Discretization for Naive Bayes},
location = {Tokyo, Japan},
related = {discretization-for-naive-bayes},
}
ABSTRACT Discretization is a popular approach to handling numeric attributes in machine learning. We argue that the requirements for effective discretization differ between naive-Bayes learning and many other learning algorithms. We evaluate the effectiveness with naive-Bayes classifiers of nine discretization methods, equal width discretization (EWD), equal frequency discretization (EFD), fuzzy discretization (FD), entropy minimization discretization (EMD), iterative discretization (ID), proportional k-interval discretization (PKID), lazy discretization (LD), non-disjoint discretization (NDD) and weighted proportional k-interval discretization (WPKID). It is found that in general naive-Bayes classifiers trained on data preprocessed by LD, NDD or WPKID achieve lower classification error than those trained on data preprocessed by the other discretization methods. But LD can not scale to large data. This study leads to a new discretization method, weighted non-disjoint discretization (WNDD) that combines WPKID and NDD's advantages. Our experiments show that among all the rival discretization methods, WNDD best helps naive-Bayes classifiers reduce average classification error.

Yang, Y., & Webb, G. I.
Proceedings of the Nineteenth International Conference on Machine Learning (ICML '02), San Francisco, pp. 666-673, 2002.

@InProceedings{YangWebb02b,
author = {Yang, Y. and Webb, G. I.},
booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML '02)},
title = {Non-Disjoint Discretization for Naive-Bayes Classifiers},
year = {2002},
editor = {Sammut, C. and Hoffmann, A. G.},
pages = {666--673},
publisher = {Morgan Kaufmann},
abstract = {Previous discretization techniques have discretized numeric attributes into disjoint intervals. We argue that this is neither necessary nor appropriate for naive-Bayes classifiers. The analysis leads to a new discretization method, Non-Disjoint Discretization (NDD). NDD forms overlapping intervals for a numeric attribute, always locating a value toward the middle of its discretized interval to obtain more reliable probability estimation. It also adjusts the number and size of discretized intervals to the number of training instances, seeking an appropriate trade-off between bias and variance of probability estimation. We justify NDD in theory and test it on a wide cross-section of datasets. Our experimental results suggest that for naive-Bayes classifiers, NDD works better than alternative discretization approaches.},
audit-trail = {Posted by Ying at http://www.cs.uvm.edu/~yyang/ndd.pdf No link on GW page - 9/2/05 requested permission},
keywords = {Discretization for Naive Bayes},
location = {Sydney, Australia},
related = {discretization-for-naive-bayes},
}
ABSTRACT Previous discretization techniques have discretized numeric attributes into disjoint intervals. We argue that this is neither necessary nor appropriate for naive-Bayes classifiers. The analysis leads to a new discretization method, Non-Disjoint Discretization (NDD). NDD forms overlapping intervals for a numeric attribute, always locating a value toward the middle of its discretized interval to obtain more reliable probability estimation. It also adjusts the number and size of discretized intervals to the number of training instances, seeking an appropriate trade-off between bias and variance of probability estimation. We justify NDD in theory and test it on a wide cross-section of datasets. Our experimental results suggest that for naive-Bayes classifiers, NDD works better than alternative discretization approaches.

Rolfe, B., Frayman, Y., Hodgson, P., & Webb, G. I.
Proceedings of the IASTED International Conference on Artificial Intelligence and Applications (AIA 2002), Calgary, Canada, pp. 155-159, 2002.

@InProceedings{RolfeFraymanHodgsonWebb02,
author = {Rolfe, B. and Frayman, Y. and Hodgson, P. and Webb, G. I.},
booktitle = {Proceedings of the IASTED International Conference on Artificial Intelligence and Applications ({AIA} 2002)},
title = {Fault Detection in a Cold Forging Process Through Feature Extraction with a Neural Network},
year = {2002},
pages = {155--159},
publisher = {ACTA Press},
abstract = {This paper investigates the application of neural networks to the recognition of lubrication defects typical to an industrial cold forging process employed by fastener manufacturers. The accurate recognition of lubrication errors, such as coating not being applied properly or damaged during material handling, is very important to the quality of the final product in fastener manufacture. Lubrication errors lead to increased forging loads and premature tool failure, as well as to increased defect sorting and the re-processing of the coated rod. The lubrication coating provides a barrier between the work material and the die during the drawing operation; moreover it needs be sufficiently robust to remain on the wire during the transfer to the cold forging operation. In the cold forging operation the wire undergoes multi-stage deformation without the application of any additional lubrication. Four types of lubrication errors, typical to production of fasteners, were introduced to a set of sample rods, which were subsequently drawn under laboratory conditions. The drawing force was measured, from which a limited set of features was extracted. The neural network based model learned from these features is able to recognize all types of lubrication errors to a high accuracy. The overall accuracy of the neural network model is around 98\% with almost uniform distribution of errors between all four errors and the normal condition.},
audit-trail = {*},
keywords = {Engineering Applications},
location = {Benalm{\'a}dena, Spain},
related = {engineering-applications},
}
ABSTRACT This paper investigates the application of neural networks to the recognition of lubrication defects typical to an industrial cold forging process employed by fastener manufacturers. The accurate recognition of lubrication errors, such as coating not being applied properly or damaged during material handling, is very important to the quality of the final product in fastener manufacture. Lubrication errors lead to increased forging loads and premature tool failure, as well as to increased defect sorting and the re-processing of the coated rod. The lubrication coating provides a barrier between the work material and the die during the drawing operation; moreover it needs be sufficiently robust to remain on the wire during the transfer to the cold forging operation. In the cold forging operation the wire undergoes multi-stage deformation without the application of any additional lubrication. Four types of lubrication errors, typical to production of fasteners, were introduced to a set of sample rods, which were subsequently drawn under laboratory conditions. The drawing force was measured, from which a limited set of features was extracted. The neural network based model learned from these features is able to recognize all types of lubrication errors to a high accuracy. The overall accuracy of the neural network model is around 98% with almost uniform distribution of errors between all four errors and the normal condition.

Webb, G. I., & Brain, D.
Proceedings of the 2002 Pacific Rim Knowledge Acquisition Workshop (PKAW'02), Tokyo, pp. 117-130, 2002.

@InProceedings{WebbBrain02,
author = {Webb, G. I. and Brain, D.},
booktitle = {Proceedings of the 2002 {Pacific} Rim Knowledge Acquisition Workshop (PKAW'02)},
title = {Generality is Predictive of Prediction Accuracy},
year = {2002},
editor = {Yamaguchi, T. and Hoffmann, A. and Motoda, H. and Compton, P.},
pages = {117--130},
publisher = {Japanese Society for Artificial Intelligence},
abstract = {There has been a dearth of research into the relative impacts of alternative high level learning biases. This paper presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. It is argued that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. It is also argued that the accuracy on unseen cases of the more specific rule will tend to be closer to the accuracy obtained on training data than will the accuracy of the more general rule. Experimental evidence is provided in support of these hypotheses. We argue that these hypotheses can be of use in selecting appropriate learning biases to achieve specific learning objectives.},
audit-trail = {*},
keywords = {Generality},
location = {Tokyo, Japan},
related = {generality-is-predictive-of-prediction-accuracy},
}
ABSTRACT There has been a dearth of research into the relative impacts of alternative high level learning biases. This paper presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. It is argued that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. It is also argued that the accuracy on unseen cases of the more specific rule will tend to be closer to the accuracy obtained on training data than will the accuracy of the more general rule. Experimental evidence is provided in support of these hypotheses. We argue that these hypotheses can be of use in selecting appropriate learning biases to achieve specific learning objectives.

Pearce, J., Webb, G. I., Shaw, R., & Garner, B.
Proceedings of the IEEE International Conference on Data Mining (ICDM-2002), Los Alamitos, CA, pp. 490-497, 2002.

@InProceedings{PearceWebbShawGarner02b,
author = {Pearce, J. and Webb, G. I. and Shaw, R. and Garner, B.},
booktitle = {Proceedings of the {IEEE} International Conference on Data Mining (ICDM-2002)},
title = {A Framework for Experimentation and Self Learning in Continuous Database Marketing},
year = {2002},
pages = {490--497},
publisher = {{IEEE} Computer Society},
abstract = {We present a method for continuous database marketing that identifies target customers for a number of marketing offers using predictive models. The algorithm then selects the appropriate offer for the customer. Experimental design principles are encapsulated to capture more information that will be used to monitor and refine the predictive models. The updated predictive models are then used for the next round of marketing offers.},
audit-trail = {http://csdl.computer.org/comp/proceedings/icdm/2002/1754/00/1754toc.htm},
location = {Maebashi City, Japan},
}
ABSTRACT We present a method for continuous database marketing that identifies target customers for a number of marketing offers using predictive models. The algorithm then selects the appropriate offer for the customer. Experimental design principles are encapsulated to capture more information that will be used to monitor and refine the predictive models. The updated predictive models are then used for the next round of marketing offers.

Webb, G. I.
In Leondes, C. T. (Ed.), In Expert Systems (, Vol. 3, pp. 937-959). San Diego, CA: Academic Press, 2002.

@InCollection{Webb02,
author = {Webb, G. I.},
booktitle = {Expert Systems},
title = {Integrating Machine Learning with Knowledge Acquisition},
year = {2002},
editor = {Leondes, C. T.},
pages = {937--959},
volume = {3},
publisher = {Academic Press},
address = {San Diego, CA},
audit-trail = {23/8 waiting on permission to post PDF. Received permission and posted PDF},
keywords = {Machine Learning with Knowledge Acquisition from Experts and Machine Learning},
related = {interactive-machine-learning},
}
ABSTRACT 

Brain, D., & Webb, G. I.
Lecture Notes in Computer Science 2431: Principles of Data Mining and Knowledge Discovery: Proceedings of the Sixth European Conference (PKDD 2002), Berlin/Heidelberg, pp. 62-73, 2002.

@InProceedings{BrainWebb02,
author = {Brain, D. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 2431: Principles of Data Mining and Knowledge Discovery: Proceedings of the Sixth European Conference (PKDD 2002)},
title = {The Need for Low Bias Algorithms in Classification Learning From Large Data Sets},
year = {2002},
pages = {62--73},
publisher = {Springer-Verlag},
abstract = {This paper reviews the appropriateness for application to large data sets of standard machine learning algorithms, which were mainly developed in the context of small data sets. Sampling and parallelization have proved useful means for reducing computation time when learning from large data sets. However, such methods assume that algorithms that were designed for use with what are now considered small data sets are also fundamentally suitable for large data sets. It is plausible that optimal learning from large data sets requires a different type of algorithm to optimal learning from small data sets. This paper investigates one respect in which data set size may affect the requirements of a learning algorithm -- the bias plus variance decomposition of classification error. Experiments show that learning from large data sets may be more effective when using an algorithm that places greater emphasis on bias management, rather than variance management},
keywords = {Learning from large datasets and Bias-Variance},
location = {Helsinki, Finland},
related = {learning-from-large-datasets},
}
ABSTRACT This paper reviews the appropriateness for application to large data sets of standard machine learning algorithms, which were mainly developed in the context of small data sets. Sampling and parallelization have proved useful means for reducing computation time when learning from large data sets. However, such methods assume that algorithms that were designed for use with what are now considered small data sets are also fundamentally suitable for large data sets. It is plausible that optimal learning from large data sets requires a different type of algorithm to optimal learning from small data sets. This paper investigates one respect in which data set size may affect the requirements of a learning algorithm � the bias plus variance decomposition of classification error. Experiments show that learning from large data sets may be more effective when using an algorithm that places greater emphasis on bias management, rather than variance management

Wang, Z., & Webb, G. I.
Proceedings of the IEEE International Conference on Data Mining (ICDM-2002), Los Alamitos, CA, pp. 775-778, 2002.

@InProceedings{WangWebb02,
author = {Wang, Z. and Webb, G. I.},
booktitle = {Proceedings of the {IEEE} International Conference on Data Mining (ICDM-2002)},
title = {Comparison of Lazy Bayesian Rule Learning and Tree-Augmented Bayesian Learning},
year = {2002},
pages = {775--778},
publisher = {{IEEE} Computer Society},
abstract = {The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. Among these, Lazy Bayesian Rules (LBR) and Tree-Augmented Na{\"i}ve-Bayes (TAN) have demonstrated strong prediction accuracy. However, their relative performance has never been evaluated. This paper compares and contrasts these two techniques, finding that they have comparable accuracy and hence should be selected according to computational profile. LBR is desirable when small numbers of objects are to be classified while TAN is desirable when large numbers of objects are to be classified},
audit-trail = {http://csdl.computer.org/comp/proceedings/icdm/2002/1754/00/1754toc.htm},
keywords = {Conditional Probability Estimation},
location = {Maebashi City, Japan},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. Among these, Lazy Bayesian Rules (LBR) and Tree-Augmented Na?ve-Bayes (TAN) have demonstrated strong prediction accuracy. However, their relative performance has never been evaluated. This paper compares and contrasts these two techniques, finding that they have comparable accuracy and hence should be selected according to computational profile. LBR is desirable when small numbers of objects are to be classified while TAN is desirable when large numbers of objects are to be classified

Wang, Z., & Webb, G. I.
Proceedings of the First Australasian Data Mining Workshop (AusDM02), Sydney, pp. 57-63, 2002.

@InProceedings{WangWebb02b,
author = {Wang, Z. and Webb, G. I.},
booktitle = {Proceedings of the First Australasian Data Mining Workshop (AusDM02)},
title = {A Heuristic Lazy Bayesian Rules Algorithm},
year = {2002},
editor = {Simoff, S. J. and Williams, G. J. and Hegland, M.},
pages = {57--63},
publisher = {University of Technology},
abstract = {Lazy Bayesian rule has demonstrated outstanding classification accuracy. However, it has high computational overheads when large numbers of instances are classified from a single training set. We compare lazy Bayesian rule and the tree-augmented Bayesian classifier, and present a new heuristic lazy Bayesian rule classifier that combines elements of the two. It requires less computation than lazy Bayesian rule, but demonstrates similar prediction accuracy.},
audit-trail = {*},
keywords = {Conditional Probability Estimation},
location = {Canberra, Australia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Lazy Bayesian rule has demonstrated outstanding classification accuracy. However, it has high computational overheads when large numbers of instances are classified from a single training set. We compare lazy Bayesian rule and the tree-augmented Bayesian classifier, and present a new heuristic lazy Bayesian rule classifier that combines elements of the two. It requires less computation than lazy Bayesian rule, but demonstrates similar prediction accuracy.

Frayman, Y., Rolfe, B., Hodgson, P., & Webb, G. I.
Proceedings of the Second IASTED International Conference on Artificial Intelligence and Applications (AIA '02), Calgary, Canada, pp. 143-148, 2002.

@InProceedings{FraymanRolfeHodgsonWebb02c,
author = {Frayman, Y. and Rolfe, B. and Hodgson, P. and Webb, G. I.},
booktitle = {Proceedings of the Second IASTED International Conference on Artificial Intelligence and Applications (AIA '02)},
title = {Predicting The Rolling Force in Hot Steel Rolling Mill using an Ensemble Model},
year = {2002},
pages = {143--148},
publisher = {ACTA Press},
abstract = {Accurate prediction of the roll separating force is critical to assuring the quality of the final product in steel manufacturing. This paper presents an ensemble model that addresses these concerns. A stacked generalisation approach to ensemble modeling is used with two sets of the ensemble model members, the first set being learnt from the current input-output data of the hot rolling finishing mill, while another uses the available information on the previous coil in addition to the current information. Both sets of ensemble members include linear regression, multilayer perceptron, and k-nearest neighbor algorithms. A competitive selection model (multilayer perceptron) is then used to select the output from one of the ensemble members to be the final output of the ensemble model. The ensemble model created by such a stacked generalization is able to achieve extremely high accuracy in predicting the roll separation force with the average relative accuracy being within 1\% of the actual measured roll force.},
audit-trail = {*},
keywords = {Engineering Applications},
location = {Benalm{\'a}dena, Spain},
related = {engineering-applications},
}
ABSTRACT Accurate prediction of the roll separating force is critical to assuring the quality of the final product in steel manufacturing. This paper presents an ensemble model that addresses these concerns. A stacked generalisation approach to ensemble modeling is used with two sets of the ensemble model members, the first set being learnt from the current input-output data of the hot rolling finishing mill, while another uses the available information on the previous coil in addition to the current information. Both sets of ensemble members include linear regression, multilayer perceptron, and k-nearest neighbor algorithms. A competitive selection model (multilayer perceptron) is then used to select the output from one of the ensemble members to be the final output of the ensemble model. The ensemble model created by such a stacked generalization is able to achieve extremely high accuracy in predicting the roll separation force with the average relative accuracy being within 1% of the actual measured roll force.

Webb, G. I., Boughton, J., & Wang, Z.
Proceedings of the First Australasian Data Mining Workshop (AusDM02), Sydney, pp. 65-73, 2002.

@InProceedings{WebbBoughtonWang02,
author = {Webb, G. I. and Boughton, J. and Wang, Z.},
booktitle = {Proceedings of the First Australasian Data Mining Workshop (AusDM02)},
title = {Averaged One-Dependence Estimators: Preliminary Results},
year = {2002},
editor = {Simoff, S. J. and Williams, G. J. and Hegland, M.},
pages = {65--73},
publisher = {University of Technology},
abstract = {Naive Bayes is a simple, computationally efficient and remarkably accurate approach to classification learning. These properties have led to its wide deployment in many online applications. However, it is based on an assumption that all attributes are conditionally independent given the class. This assumption leads to decreased accuracy in some applications. AODE overcomes the attribute independence assumption of naive Bayes by averaging over all models in which all attributes depend upon the class and a single other attribute. The resulting classification learning algorithm for nominal data is computationally efficient and achieves very low error rates.},
audit-trail = {*},
keywords = {Conditional Probability Estimation and AODE},
location = {Canberra, Australia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Naive Bayes is a simple, computationally efficient and remarkably accurate approach to classification learning. These properties have led to its wide deployment in many online applications. However, it is based on an assumption that all attributes are conditionally independent given the class. This assumption leads to decreased accuracy in some applications. AODE overcomes the attribute independence assumption of naive Bayes by averaging over all models in which all attributes depend upon the class and a single other attribute. The resulting classification learning algorithm for nominal data is computationally efficient and achieves very low error rates.

Frayman, Y., Rolfe, B., & Webb, G. I.
Proceedings of the Design Engineering Technical Conferences and Computer and Information in Engineering Conference (DETC'02/ASME 2002), New York, pp. 1-8, 2002.

@InProceedings{FraymanRolfeWebb02b,
author = {Frayman, Y. and Rolfe, B. and Webb, G. I.},
booktitle = {Proceedings of the Design Engineering Technical Conferences and Computer and Information in Engineering Conference (DETC'02/ASME 2002)},
title = {Improving an Inverse Model of Sheet Metal Forming by Neural Network Based Regression},
year = {2002},
pages = {1--8},
publisher = {ASME Press},
abstract = {The inverse model for a sheet metal forming process aims to determine the initial parameter levels required to form the final formed shape. This is a difficult problem that is usually approached by traditional methods such as finite element analysis. Formulating the problem as a classification problem makes it possible to use well established classification algorithms such as decision trees. The classification is, however, generally based on a winner-takes-all approach when associating the output value with the corresponding class. On the other hand when formulating the problem as a regression task, all the output values are combined to produce the corresponding class value. For a multi-class problem, this may result in very different associations between the output of the model and the corresponding class. Such formulation makes it possible to use well known regression algorithms such as neural networks. In this paper, we develop a neural network based inverse model of a sheet forming process, and compare its performance with that of a linear model. Both models are used in two modes: classification mode and a function estimation mode to investigate the advantage of re-formulating the problem as function estimation. This results in large improvements in the recognition rate of set-up parameters of a sheet metal forming process for both models, with a neural network model achieving much more accurate parameters recognition than a linear model.},
audit-trail = {*},
keywords = {Engineering Applications},
related = {engineering-applications},
}
ABSTRACT The inverse model for a sheet metal forming process aims to determine the initial parameter levels required to form the final formed shape. This is a difficult problem that is usually approached by traditional methods such as finite element analysis. Formulating the problem as a classification problem makes it possible to use well established classification algorithms such as decision trees. The classification is, however, generally based on a winner-takes-all approach when associating the output value with the corresponding class. On the other hand when formulating the problem as a regression task, all the output values are combined to produce the corresponding class value. For a multi-class problem, this may result in very different associations between the output of the model and the corresponding class. Such formulation makes it possible to use well known regression algorithms such as neural networks. In this paper, we develop a neural network based inverse model of a sheet forming process, and compare its performance with that of a linear model. Both models are used in two modes: classification mode and a function estimation mode to investigate the advantage of re-formulating the problem as function estimation. This results in large improvements in the recognition rate of set-up parameters of a sheet metal forming process for both models, with a neural network model achieving much more accurate parameters recognition than a linear model.

Frayman, Y., Rolfe, B., & Webb, G. I.
Lecture Notes in Computer Science Vol. 2557: Proceedings of the 15th Australian Joint Conference on Artificial Intelligence (AI 02), Berlin/Heidelberg, pp. 511-522, 2002.

@InProceedings{FraymanRolfeWebb02,
author = {Frayman, Y. and Rolfe, B. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 2557: Proceedings of the 15th Australian Joint Conference on Artificial Intelligence (AI 02)},
title = {Solving Regression Problems using Competitive Ensemble Models},
year = {2002},
editor = {McKay, B. and Slaney, J. K.},
pages = {511--522},
publisher = {Springer},
abstract = {The use of ensemble models in many problem domains has increased significantly in the last few years. The ensemble modelling, in particular boosting, has shown great promise in improving predictive performance of a model. Combining the ensemble members is normally done in a co-operative fashion where each of the ensemble members performs the same task and their predictions are aggregated to obtain the improved performance. However, it is also possible to combine the ensemble members in a competitive fashion where the best prediction of a relevant ensemble member is selected for a particular input. This option has been previously somewhat overlooked. The aim of this article is to investigate and compare the competitive and co-operative approaches to combining the models in the ensemble. A comparison is made between a competitive ensemble model and that of MARS with bagging, mixture of experts, hierarchical mixture of experts and a neural network ensemble over several public domain regression problems that have a high degree of nonlinearity and noise. The empirical results show a substantial advantage of competitive learning versus the co-operative learning for all the regression problems investigated. The requirements for creating the efficient ensembles and the available guidelines are also discussed.},
audit-trail = {*},
keywords = {Engineering Applications},
location = {Canberra, Australia},
related = {engineering-applications},
}
ABSTRACT The use of ensemble models in many problem domains has increased significantly in the last few years. The ensemble modelling, in particular boosting, has shown great promise in improving predictive performance of a model. Combining the ensemble members is normally done in a co-operative fashion where each of the ensemble members performs the same task and their predictions are aggregated to obtain the improved performance. However, it is also possible to combine the ensemble members in a competitive fashion where the best prediction of a relevant ensemble member is selected for a particular input. This option has been previously somewhat overlooked. The aim of this article is to investigate and compare the competitive and co-operative approaches to combining the models in the ensemble. A comparison is made between a competitive ensemble model and that of MARS with bagging, mixture of experts, hierarchical mixture of experts and a neural network ensemble over several public domain regression problems that have a high degree of nonlinearity and noise. The empirical results show a substantial advantage of competitive learning versus the co-operative learning for all the regression problems investigated. The requirements for creating the efficient ensembles and the available guidelines are also discussed.

Pearce, J., Webb, G. I., Shaw, R., & Garner, B.
Proceedings of the Australian and New Zealand Marketing Academy Conference (ANZMAC 02), Geelong, Victoria, pp. 2941-2948, 2002.

@InProceedings{PearceWebbShawGarner02,
author = {Pearce, J. and Webb, G. I. and Shaw, R. and Garner, B.},
booktitle = {Proceedings of the Australian and New Zealand Marketing Academy Conference (ANZMAC 02)},
title = {A Systemic Approach to the Database Marketing Process},
year = {2002},
pages = {2941--2948},
publisher = {Deakin University (CD Rom)},
abstract = {The role of database marketing (DBM) has become increasingly important for organisations that have large databases of information on customers with whom they deal directly. At the same time, DBM models used in practice have increased in sophistication. This paper examines a systemic view of DBM and the role of analytical techniques within DBM. It extends existing process models to develop a systemic model that encompasses the increased complexity of DBM in practice. The systemic model provides a framework to integrate data mining, experimental design and prioritisation decisions. This paper goes on to identify opportunities for research in DBM, including DBM process models used in practice, the use of evolutionary operations techniques in DBM, prioritisation decisions, and the factors that surround the uptake of DBM.},
audit-trail = {*},
location = {Geelong, Australia},
}
ABSTRACT The role of database marketing (DBM) has become increasingly important for organisations that have large databases of information on customers with whom they deal directly. At the same time, DBM models used in practice have increased in sophistication. This paper examines a systemic view of DBM and the role of analytical techniques within DBM. It extends existing process models to develop a systemic model that encompasses the increased complexity of DBM in practice. The systemic model provides a framework to integrate data mining, experimental design and prioritisation decisions. This paper goes on to identify opportunities for research in DBM, including DBM process models used in practice, the use of evolutionary operations techniques in DBM, prioritisation decisions, and the factors that surround the uptake of DBM.

Yang, Y., & Webb, G. I.
Lecture Notes in Computer Science 2167: Proceedings of the 12th European Conference on Machine Learning (ECML'01), Berlin/Heidelberg, pp. 564-575, 2001.

@InProceedings{YangWebb01,
author = {Yang, Y. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 2167: Proceedings of the 12th European Conference on Machine Learning (ECML'01)},
title = {Proportional K-Interval Discretization for Naive-{Bayes} Classifiers},
year = {2001},
editor = {De Raedt, L. and Flach, P. A.},
pages = {564--575},
publisher = {Springer-Verlag},
abstract = {This paper argues that two commonly-used discretization approaches, fixed k-interval discretization and entropy-based discretization have sub-optimal characteristics for naive-Bayes classification. This analysis leads to a new discretization method, Proportional k-Interval Discretization (PKID), which adjusts the number and size of discretized intervals to the number of training instances, thus seeks an appropriate trade-off between the bias and variance of the probability estimation for naive-Bayes classifiers. We justify PKID in theory, as well as test it on a wide cross-section of datasets. Our experimental results suggest that in comparison to its alternatives, PKID provides naive-Bayes classifiers competitive classification performance for smaller datasets and better classification performance for larger datasets.},
keywords = {Discretization for Naive Bayes},
location = {Freiburg, Germany},
related = {discretization-for-naive-bayes},
}
ABSTRACT This paper argues that two commonly-used discretization approaches, fixed k-interval discretization and entropy-based discretization have sub-optimal characteristics for naive-Bayes classification. This analysis leads to a new discretization method, Proportional k-Interval Discretization (PKID), which adjusts the number and size of discretized intervals to the number of training instances, thus seeks an appropriate trade-off between the bias and variance of the probability estimation for naive-Bayes classifiers. We justify PKID in theory, as well as test it on a wide cross-section of datasets. Our experimental results suggest that in comparison to its alternatives, PKID provides naive-Bayes classifiers competitive classification performance for smaller datasets and better classification performance for larger datasets.

Webb, G. I., & Zhang, S.
Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01), Berlin, pp. 605-618, 2001.

@InProceedings{WebbZhang01,
author = {Webb, G. I. and Zhang, S.},
booktitle = {Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01)},
title = {Further Pruning for Efficient Association Rule Discovery},
year = {2001},
editor = {Stumptner, M. and Corbett, D. and Brooks, M. J.},
pages = {605--618},
publisher = {Springer},
abstract = {The Apriori algorithm's frequent itemset approach has become the standard approach to discovering association rules. However, the computation requirements of the frequent itemset approach are infeasible for dense data and the approach is unable to discover infrequent associations. OPUS\_AR is an efficient algorithm for rule discovery that does not utilize frequent itemsets and hence avoids these problems. It can reduce search time by using additional constraints on the search space as well as constraints on itemset frequency. However, the effectiveness of the pruning rules used during search will determine the efficiency of its search. This paper presents and analyzes pruning rules for use with OPUS\_AR. We demonstrate that application of OPUS\_AR is feasible for a number of datasets for which application of the frequent itemset approach is infeasible and that the new pruning rules can reduce compute time by more than 40\%.},
audit-trail = {*},
keywords = {OPUS and Association Rule Discovery},
related = {filtered-top-k-association-discovery},
}
ABSTRACT The Apriori algorithm's frequent itemset approach has become the standard approach to discovering association rules. However, the computation requirements of the frequent itemset approach are infeasible for dense data and the approach is unable to discover infrequent associations. OPUS_AR is an efficient algorithm for rule discovery that does not utilize frequent itemsets and hence avoids these problems. It can reduce search time by using additional constraints on the search space as well as constraints on itemset frequency. However, the effectiveness of the pruning rules used during search will determine the efficiency of its search. This paper presents and analyzes pruning rules for use with OPUS_AR. We demonstrate that application of OPUS_AR is feasible for a number of datasets for which application of the frequent itemset approach is infeasible and that the new pruning rules can reduce compute time by more than 40%.

Webb, G. I., Pazzani, M. J., & Billsus, D.
User Modeling and User-Adapted Interaction, 11, 19-20, 2001.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@Article{WebbPazzaniBillsus01,
author = {Webb, G. I. and Pazzani, M. J. and Billsus, D.},
journal = {User Modeling and User-Adapted Interaction},
title = {Machine learning for user modeling},
year = {2001},
pages = {19--20},
volume = {11},
abstract = {At first blush, user modeling appears to be a prime candidate for straight forward application of standard machine learning techniques. Observations of the user's behavior can provide training examples that a machine learning system can use to form a model designed to predict future actions. However, user modeling poses a number of challenges for machine learning that have hindered its application in user modeling, including: the need for large data sets; the need for labelled data; concept drift; and computational complexity. This paper examines each of these issues and reviews approaches to resolving them.},
audit-trail = {Link to pdf via UMUAI site. Also available at http://www.kluweronline.com/issn/0924-1868},
doi = {10.1023/A:1011117102175},
keywords = {Feature Based Modeling and User Modeling},
publisher = {Springer},
related = {feature-based-modeling},
}
ABSTRACT At first blush, user modeling appears to be a prime candidate for straight forward application of standard machine learning techniques. Observations of the user's behavior can provide training examples that a machine learning system can use to form a model designed to predict future actions. However, user modeling poses a number of challenges for machine learning that have hindered its application in user modeling, including: the need for large data sets; the need for labelled data; concept drift; and computational complexity. This paper examines each of these issues and reviews approaches to resolving them.

Webb, G. I.
Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01), Berlin/Heidelberg, pp. 545-556, 2001.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{Webb01b,
author = {Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01)},
title = {Candidate Elimination Criteria for Lazy {Bayesian} Rules},
year = {2001},
editor = {Stumptner, M. and Corbett, D. and Brooks, M. J.},
pages = {545--556},
publisher = {Springer},
abstract = {Lazy Bayesian Rules modify naive Bayesian classification to undo elements of the harmful attribute independence assumption. It has been shown to provide classification error comparable to boosting decision trees. This paper explores alternatives to the candidate elimination criterion employed within Lazy Bayesian Rules. Improvements over naive Bayes are consistent so long as the candidate elimination criteria ensures there is sufficient data for accurate probability estimation. However, the original candidate elimination criterion is demonstrated to provide better overall error reduction than the use of a minimum data subset size criterion.},
audit-trail = {*},
doi = {10.1007/3-540-45656-2_47},
keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Lazy Bayesian Rules modify naive Bayesian classification to undo elements of the harmful attribute independence assumption. It has been shown to provide classification error comparable to boosting decision trees. This paper explores alternatives to the candidate elimination criterion employed within Lazy Bayesian Rules. Improvements over naive Bayes are consistent so long as the candidate elimination criteria ensures there is sufficient data for accurate probability estimation. However, the original candidate elimination criterion is demonstrated to provide better overall error reduction than the use of a minimum data subset size criterion.

Webb, G. I.
Proceedings of the Seventh ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2001)[short paper], New York, pp. 383-388, 2001.
[Bibtex] [Abstract]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{Webb01a,
author = {Webb, G. I.},
booktitle = {Proceedings of the Seventh {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2001)[short paper]},
title = {Discovering Associations with Numeric Variables},
year = {2001},
editor = {Provost, F. and Srikant, R.},
pages = {383--388},
publisher = {The Association for Computing Machinery},
abstract = {This paper further develops Aumann and Lindell's [3] proposal for a variant of association rules for which the consequent is a numeric variable. It is argued that these rules can discover useful interactions with numeric data that cannot be discovered directly using traditional association rules with discretization. Alternative measures for identifying interesting rules are proposed. Efficient algorithms are presented that enable these rules to be discovered for dense data sets for which application of Aumann and Lindell's algorithm is infeasible.},
audit-trail = {*},
keywords = {Association Rule Discovery and statistically sound discovery and OPUS and Impact Rules},
location = {San Francisco, CA},
related = {impact-rules},
url = {http://dl.acm.org/authorize?19861},
}
ABSTRACT This paper further develops Aumann and Lindell's [3] proposal for a variant of association rules for which the consequent is a numeric variable. It is argued that these rules can discover useful interactions with numeric data that cannot be discovered directly using traditional association rules with discretization. Alternative measures for identifying interesting rules are proposed. Efficient algorithms are presented that enable these rules to be discovered for dense data sets for which application of Aumann and Lindell's algorithm is infeasible.

Implementation of Lazy Bayesian Rules in the Weka System.
Wang, Z., Webb, G. I., & Dai, H.
Software Technology Catering for 21st Century: Proceedings of the International Symposium on Future Software Technology (ISFST2001), Tokyo, pp. 204-208, 2001.
[Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebbDai01,
author = {Wang, Z. and Webb, G. I. and Dai, H.},
booktitle = {Software Technology Catering for 21st Century: Proceedings of the International Symposium on Future Software Technology (ISFST2001)},
title = {Implementation of Lazy {Bayesian} Rules in the {Weka} System},
year = {2001},
pages = {204--208},
publisher = {Software Engineers Association},
abstract = {The naive Bayesian classification algorithms were shown to be computationally efficient and surprisingly accurate when the conditional independence assumption on which they are based is violated. The lazy Bayesian rule is the application of lazy learning techniques to Bayesian tree induction, which supports a weaker conditional attribute independence assumption. The Weka system is a full, industrial-strength implementation of essentially almost the state-of-the-art machine learning techniques, and it contains a framework, in the form of a Java class library, which supports applications that use embedded machine learning and even the implementation of new learning schemes. In this paper, we mainly discuss the implementation of the algorithm of lazy Bayesian rule in Weka System, and introduce all the methods to be used in the Java class. This is the first lazy learning scheme implemented in Weka System.},
audit-trail = {*},
keywords = {Conditional Probability Estimation},
location = {Zheng Zhou, China},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT The naive Bayesian classification algorithms were shown to be computationally efficient and surprisingly accurate when the conditional independence assumption on which they are based is violated. The lazy Bayesian rule is the application of lazy learning techniques to Bayesian tree induction, which supports a weaker conditional attribute independence assumption. The Weka system is a full, industrial-strength implementation of essentially almost the state-of-the-art machine learning techniques, and it contains a framework, in the form of a Java class library, which supports applications that use embedded machine learning and even the implementation of new learning schemes. In this paper, we mainly discuss the implementation of the algorithm of lazy Bayesian rule in Weka System, and introduce all the methods to be used in the Java class. This is the first lazy learning scheme implemented in Weka System.

Smith, P. A., & Webb, G. I.
Journal of Educational Computing Research, 22(2), 187-215, 2000.

@Article{SmithWebb00,
author = {Smith, P. A. and Webb, G. I.},
journal = {Journal of Educational Computing Research},
title = {The Efficacy of a Low-Level Program Visualization Tool for Teaching Programming Concepts to Novice {C} Programmers},
year = {2000},
number = {2},
pages = {187--215},
volume = {22},
audit-trail = {Link to pdf via Baywood Publishing Company},
keywords = {Program Visualisation},
publisher = {Baywood Publishing},
related = {program-visualisation},
}
ABSTRACT 

Webb, G. I.
Machine Learning, 40(2), 159-196, 2000.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb00a,
author = {Webb, G. I.},
journal = {Machine Learning},
title = {{MultiBoosting}: A Technique for Combining Boosting and Wagging},
year = {2000},
number = {2},
pages = {159--196},
volume = {40},
abstract = {MultiBoosting is an extension to the highly successful AdaBoost technique for forming decision committees. MultiBoosting can be viewed as combining AdaBoost with wagging. It is able to harness both AdaBoost's high bias and variance reduction with wagging's superior variance reduction. Using C4.5 as the base learning algorithm, Multi-boosting is demonstrated to produce decision committees with lower error than either AdaBoost or wagging significantly more often than the reverse over a large representative cross-section of UCI data sets. It offers the further advantage over AdaBoost of suiting parallel execution.},
audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF posted 30/10/03},
doi = {10.1023/A:1007659514849},
keywords = {MultiBoosting and Boosting and Bias-Variance},
publisher = {Springer},
related = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT MultiBoosting is an extension to the highly successful AdaBoost technique for forming decision committees. MultiBoosting can be viewed as combining AdaBoost with wagging. It is able to harness both AdaBoost's high bias and variance reduction with wagging's superior variance reduction. Using C4.5 as the base learning algorithm, Multi-boosting is demonstrated to produce decision committees with lower error than either AdaBoost or wagging significantly more often than the reverse over a large representative cross-section of UCI data sets. It offers the further advantage over AdaBoost of suiting parallel execution.

Webb, G. I.
Proceedings of the Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2000), New York, pp. 99-107, 2000.

@InProceedings{Webb00b,
author = {Webb, G. I.},
booktitle = {Proceedings of the Sixth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2000)},
title = {Efficient Search for Association Rules},
year = {2000},
editor = {Ramakrishnan, R. and Stolfo, S.},
pages = {99--107},
publisher = {The Association for Computing Machinery},
abstract = {This paper argues that for some applications direct search for association rules can be more efficient than the two stage process of the Apriori algorithm which first finds large item sets which are then used to identify associations. In particular, it is argued, Apriori can impose large computational overheads when the number of frequent itemsets is very large. This will often be the case when association rule analysis is performed on domains other than basket analysis or when it is performed for basket analysis with basket information augmented by other customer information. An algorithm is presented that is computationally efficient for association rule analysis during which the number of rules to be found can be constrained and all data can be maintained in memory.},
audit-trail = {*},
keywords = {Search and OPUS and Association Rule Discovery},
location = {Boston, MA},
related = {filtered-top-k-association-discovery},
}
ABSTRACT This paper argues that for some applications direct search for association rules can be more efficient than the two stage process of the Apriori algorithm which first finds large item sets which are then used to identify associations. In particular, it is argued, Apriori can impose large computational overheads when the number of frequent itemsets is very large. This will often be the case when association rule analysis is performed on domains other than basket analysis or when it is performed for basket analysis with basket information augmented by other customer information. An algorithm is presented that is computationally efficient for association rule analysis during which the number of rules to be found can be constrained and all data can be maintained in memory.

Zheng, Z., & Webb, G. I.
Machine Learning, 41(1), 53-84, 2000.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{ZhengWebb00,
author = {Zheng, Z. and Webb, G. I.},
journal = {Machine Learning},
title = {Lazy Learning of {Bayesian} Rules},
year = {2000},
number = {1},
pages = {53--84},
volume = {41},
abstract = {The naive Bayesian classifier provides a simple and effective approach to classifier learning, but its attribute independence assumption is often violated in the real world. A number of approaches have sought to alleviate this problem. A Bayesian tree learning algorithm builds a decision tree, and generates a local naive Bayesian classifier at each leaf. The tests leading to a leaf can alleviate attribute inter-dependencies for the local naive Bayesian classifier. However, Bayesian tree learning still suffers from the small disjunct problem of tree learning. While inferred Bayesian trees demonstrate low average prediction error rates, there is reason to believe that error rates will be higher for those leaves with few training examples. This paper proposes the application of lazy learning techniques to Bayesian tree induction and presents the resulting lazy Bayesian rule learning algorithm, called LBR. For each test example, it builds a most appropriate rule with a local naive Bayesian classifier as its consequent. It is demonstrated that the computational requirements of LBR are reasonable in a wide cross-selection of natural domains. Experiments with these domains show that, on average, this new algorithm obtains lower error rates significantly more often than the reverse in comparison to a naive Bayesian classifier, C4.5, a Bayesian tree learning algorithm, a constructive Bayesian classifier that eliminates attributes and constructs new attributes using Cartesian products of existing nominal attributes, and a lazy decision tree learning algorithm. It also outperforms, although the result is not statistically significant, a selective naive Bayesian classifier.},
audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF posted 30/10/03},
doi = {10.1023/A:1007613203719},
keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
publisher = {Springer},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT The naive Bayesian classifier provides a simple and effective approach to classifier learning, but its attribute independence assumption is often violated in the real world. A number of approaches have sought to alleviate this problem. A Bayesian tree learning algorithm builds a decision tree, and generates a local naive Bayesian classifier at each leaf. The tests leading to a leaf can alleviate attribute inter-dependencies for the local naive Bayesian classifier. However, Bayesian tree learning still suffers from the small disjunct problem of tree learning. While inferred Bayesian trees demonstrate low average prediction error rates, there is reason to believe that error rates will be higher for those leaves with few training examples. This paper proposes the application of lazy learning techniques to Bayesian tree induction and presents the resulting lazy Bayesian rule learning algorithm, called LBR. For each test example, it builds a most appropriate rule with a local naive Bayesian classifier as its consequent. It is demonstrated that the computational requirements of LBR are reasonable in a wide cross-selection of natural domains. Experiments with these domains show that, on average, this new algorithm obtains lower error rates significantly more often than the reverse in comparison to a naive Bayesian classifier, C4.5, a Bayesian tree learning algorithm, a constructive Bayesian classifier that eliminates attributes and constructs new attributes using Cartesian products of existing nominal attributes, and a lazy decision tree learning algorithm. It also outperforms, although the result is not statistically significant, a selective naive Bayesian classifier.

Smith, P. A., & Webb, G. I.
Proceedings of the Seventh International Conference on Computers in Education (ICCE '99), Amsterdam, pp. 385-392, 1999.

@InProceedings{SmithWebb99,
author = {Smith, P. A. and Webb, G. I.},
booktitle = {Proceedings of the Seventh International Conference on Computers in Education (ICCE '99)},
title = {Evaluation of Low-Level Program Visualisation for Teaching Novice {C} Programmers},
year = {1999},
editor = {Cumming, G. and Okamoto, T. and Gomez, L.},
pages = {385--392},
publisher = {IOS Press},
volume = {2},
abstract = {While several program visualisation tools aimed at novice programmers have been developed over the past decade there is little empirical evidence showing that novices actually benefit from their use (Mulholland, 1995). Bradman (Smith \& Webb, 1998) is a low-level program visualisation tool. We present an experiment that tests the efficacy of Bradman in assisting novice programmers learn programming concepts. We show that students with access to this low-level program visualisation tool achieved greater understanding of some programming concepts than those without access.},
audit-trail = {pdf on file is early stage with corrections},
keywords = {Program Visualisation},
location = {Chiba, Japan},
related = {program-visualisation},
}
ABSTRACT While several program visualisation tools aimed at novice programmers have been developed over the past decade there is little empirical evidence showing that novices actually benefit from their use (Mulholland, 1995). Bradman (Smith & Webb, 1998) is a low-level program visualisation tool. We present an experiment that tests the efficacy of Bradman in assisting novice programmers learn programming concepts. We show that students with access to this low-level program visualisation tool achieved greater understanding of some programming concepts than those without access.

Newlands, D., & Webb, G. I.
Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third Pacific-Asia Conference (PAKDD'99), Berlin/Heidelberg, pp. 306-316, 1999.
[Bibtex] [Abstract]  → Access on publisher site

@InProceedings{NewlandsWebb99a,
author = {Newlands, D. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third {Pacific}-{Asia} Conference (PAKDD'99)},
title = {Convex Hulls in Concept Induction},
year = {1999},
editor = {Zhong, N. and Zhou, L.},
pages = {306--316},
publisher = {Springer-Verlag},
abstract = {This paper investigates modelling concepts as a few, large convex hulls rather than as many, small, axis-orthogonal divisions as is done by systems which currently dominate classification learning. It is argued that this approach produces classifiers which have less strong hypothesis language bias and which, because of the fewness of the concepts induced, are more understandable. The design of such a system is described and its performance is investigated. Convex hulls are shown to be a useful inductive generalisation technique offering rather different biases than well-known systems such as C4.5 and CN2. The types of domains where convex hulls can be usefully employed are described.},
location = {Beijing, China},
}
ABSTRACT This paper investigates modelling concepts as a few, large convex hulls rather than as many, small, axis-orthogonal divisions as is done by systems which currently dominate classification learning. It is argued that this approach produces classifiers which have less strong hypothesis language bias and which, because of the fewness of the concepts induced, are more understandable. The design of such a system is described and its performance is investigated. Convex hulls are shown to be a useful inductive generalisation technique offering rather different biases than well-known systems such as C4.5 and CN2. The types of domains where convex hulls can be usefully employed are described.

Brain, D., & Webb, G. I.
Proceedings of the Fourth Australian Knowledge Acquisition Workshop (AKAW-99), Sydney, pp. 117-128, 1999.

@InProceedings{BrainWebb99,
author = {Brain, D. and Webb, G. I.},
booktitle = {Proceedings of the Fourth {Australian} Knowledge Acquisition Workshop ({AKAW}-99)},
title = {On The Effect of Data Set Size on Bias And Variance in Classification Learning},
year = {1999},
editor = {Richards, D. and Beydoun, G. and Hoffmann, A. and Compton, P.},
pages = {117--128},
publisher = {The University of New South Wales},
abstract = {With the advent of data mining, machine learning has come of age and is now a critical technology in many businesses. However, machine learning evolved in a different research context to that in which it now finds itself employed. A particularly important problem in the data mining world is working effectively with large data sets. However, most machine learning research has been conducted in the context of learning from very small data sets. To date most approaches to scaling up machine learning to large data sets have attempted to modify existing algorithms to deal with large data sets in a more computationally efficient and effective manner. But is this necessarily the best method? This paper explores the possibility of designing algorithms specifically for large data sets. Specifically, the paper looks at how increasing data set size affects bias and variance error decompositions for classification algorithms. Preliminary results of experiments to determine these effects are presented, showing that, as hypothesized, variance can be expected to decrease as training set size increases. No clear effect of training set size on bias was observed. These results have profound implications for data mining from large data sets, indicating that developing effective learning algorithms for large data sets is not simply a matter of finding computationally efficient variants of existing learning algorithms.},
audit-trail = {*},
keywords = {Learning from large datasets and Bias-Variance},
location = {Sydney, Australia},
related = {learning-from-large-datasets},
}
ABSTRACT With the advent of data mining, machine learning has come of age and is now a critical technology in many businesses. However, machine learning evolved in a different research context to that in which it now finds itself employed. A particularly important problem in the data mining world is working effectively with large data sets. However, most machine learning research has been conducted in the context of learning from very small data sets. To date most approaches to scaling up machine learning to large data sets have attempted to modify existing algorithms to deal with large data sets in a more computationally efficient and effective manner. But is this necessarily the best method? This paper explores the possibility of designing algorithms specifically for large data sets. Specifically, the paper looks at how increasing data set size affects bias and variance error decompositions for classification algorithms. Preliminary results of experiments to determine these effects are presented, showing that, as hypothesized, variance can be expected to decrease as training set size increases. No clear effect of training set size on bias was observed. These results have profound implications for data mining from large data sets, indicating that developing effective learning algorithms for large data sets is not simply a matter of finding computationally efficient variants of existing learning algorithms.

Zheng, Z., & Webb, G. I.
Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third Pacific-Asia Conference (PAKDD'99), Berlin/Heidelberg, pp. 123-132, 1999.

@InProceedings{ZhengWebb99b,
author = {Zheng, Z. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third {Pacific}-{Asia} Conference (PAKDD'99)},
title = {Stochastic Attribute Selection Committees with Multiple Boosting: Learning More Accurate and More Stable Classifier Committees},
year = {1999},
editor = {Zhong, N. and Zhou, L.},
pages = {123--132},
publisher = {Springer-Verlag},
abstract = {Classifier learning is a key technique for KDD. Approaches to learning classifier committees, including Boosting, Bagging, SASC, and SASCB, have demonstrated great success in increasing the prediction accuracy of decision trees. Boosting and Bagging create different classifiers by modifying the distribution of the training set. SASC adopts a different method. It generates committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. SASCB, a combination of Boosting and SASC, has shown the ability to further increase, on average, the prediction accuracy of decision trees. It has been found that the performance of SASCB and Boosting is more variable than that of SASC, although SASCB is more accurate than the others on average. In this paper, we present a novel method to reduce variability of SASCB and Boosting, and further increase their average accuracy. It generates multiple committees by incorporating Bagging into SASCB. As well as improving stability and average accuracy, the resulting method is amenable to parallel or distributed processing, while Boosting and SascB are not. This is an important characteristic for datamining in large datasets.},
keywords = {MultiBoosting and Boosting and Stochastic Attribute Selection committees},
location = {Beijing, China},
related = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT Classifier learning is a key technique for KDD. Approaches to learning classifier committees, including Boosting, Bagging, SASC, and SASCB, have demonstrated great success in increasing the prediction accuracy of decision trees. Boosting and Bagging create different classifiers by modifying the distribution of the training set. SASC adopts a different method. It generates committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. SASCB, a combination of Boosting and SASC, has shown the ability to further increase, on average, the prediction accuracy of decision trees. It has been found that the performance of SASCB and Boosting is more variable than that of SASC, although SASCB is more accurate than the others on average. In this paper, we present a novel method to reduce variability of SASCB and Boosting, and further increase their average accuracy. It generates multiple committees by incorporating Bagging into SASCB. As well as improving stability and average accuracy, the resulting method is amenable to parallel or distributed processing, while Boosting and SascB are not. This is an important characteristic for datamining in large datasets.

Chiu, B. C., & Webb, G. I.
Proceedings of the Seventh International Conference on Computers in Education (ICCE '99), Amsterdam, pp. 111-118, 1999.

@InProceedings{ChiuWebb99a,
author = {Chiu, B. C. and Webb, G. I.},
booktitle = {Proceedings of the Seventh International Conference on Computers in Education (ICCE '99)},
title = {Dual-Model: An Architecture for Utilizing Temporal Information in Student Modeling},
year = {1999},
editor = {Cumming, G. and Okamoto, T. and Gomez, L.},
pages = {111--118},
publisher = {IOS Press},
volume = {1},
abstract = {A modeling system may be required to predict an agent's future actions even when confronted by inadequate or contradictory relevant evidence from observations of past actions. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. This raises two issues. First, when maximizing prediction rate is preferable, what mechanisms can be employed such that a system can make more predictions without severely degrading prediction accuracy? Second, for contexts in which accuracy is of primary importance, how can we further improve prediction accuracy? A recently proposed Dual-model approach, which takes models' temporal characteristics into account, suggests a solution to the first problem, but leaves room for further improvement. This paper presents two classes of Dual-model variant. Each aims to achieve one of the above objectives. With the performance of the original system as a baseline, which does not utilize the temporal information, empirical evaluations in the domain of elementary subtraction show that one class of variant outperforms the baseline in prediction rate while the other does so in prediction accuracy, without significantly affecting other overall measures of the original performance. Keywords: Agent modeling, Student modeling, Temporal model, Decision tree.},
audit-trail = {*},
keywords = {Feature Based Modeling and User Modeling},
location = {Chiba, Japan},
note = {Also appeared in the Proceedings of ACAI Workshop W03: Machine Learning in User Modeling, pp. 46--53},
related = {feature-based-modeling},
}
ABSTRACT A modeling system may be required to predict an agent's future actions even when confronted by inadequate or contradictory relevant evidence from observations of past actions. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. This raises two issues. First, when maximizing prediction rate is preferable, what mechanisms can be employed such that a system can make more predictions without severely degrading prediction accuracy? Second, for contexts in which accuracy is of primary importance, how can we further improve prediction accuracy? A recently proposed Dual-model approach, which takes models' temporal characteristics into account, suggests a solution to the first problem, but leaves room for further improvement. This paper presents two classes of Dual-model variant. Each aims to achieve one of the above objectives. With the performance of the original system as a baseline, which does not utilize the temporal information, empirical evaluations in the domain of elementary subtraction show that one class of variant outperforms the baseline in prediction rate while the other does so in prediction accuracy, without significantly affecting other overall measures of the original performance. Keywords: Agent modeling, Student modeling, Temporal model, Decision tree.

Ting, K. M., Zheng, Z., & Webb, G. I.
Proceedings of the Nineteenth SGES International Conference on Knowledge Based Systems and Applied Artificial Intelligence (ES'99), New York, pp. 122-131, 1999.

@InProceedings{TingZhengWebb99,
author = {Ting, K. M. and Zheng, Z. and Webb, G. I.},
booktitle = {Proceedings of the Nineteenth SGES International Conference on Knowledge Based Systems and Applied Artificial Intelligence (ES'99)},
title = {Learning Lazy Rules to Improve the Performance of Classifiers},
year = {1999},
editor = {Coenen, F. and Macintosh, A.},
pages = {122--131},
publisher = {Springer},
abstract = {Based on an earlier study on lazy Bayesian rule learning, this paper introduces a general lazy learning framework, called LAZYRULE, that begins to learn a rule only when classifying a test case. The objective of the framework is to improve the performance of a base learning algorithm. It has the potential to be used for different types of base learning algorithms. LAZYRULE performs attribute elimination and training case selection using cross-validation to generate the most appropriate rule for each test case. At the consequent of the rule, it applies the base learning algorithm on the selected training subset and the remaining attributes to construct a classifier to make a prediction. This combined action seeks to build a better performing classifier for each test case than the classifier trained using all attributes and all training cases. We show empirically that LAZYRULE improves the performances of naive Bayesian classifiers and majority vote.},
audit-trail = {*},
keywords = {Conditional Probability Estimation},
location = {Peterhouse College, Cambridge, UK},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Based on an earlier study on lazy Bayesian rule learning, this paper introduces a general lazy learning framework, called LAZYRULE, that begins to learn a rule only when classifying a test case. The objective of the framework is to improve the performance of a base learning algorithm. It has the potential to be used for different types of base learning algorithms. LAZYRULE performs attribute elimination and training case selection using cross-validation to generate the most appropriate rule for each test case. At the consequent of the rule, it applies the base learning algorithm on the selected training subset and the remaining attributes to construct a classifier to make a prediction. This combined action seeks to build a better performing classifier for each test case than the classifier trained using all attributes and all training cases. We show empirically that LAZYRULE improves the performances of naive Bayesian classifiers and majority vote.

Webb, G. I.
Proceedings of the Sixteenth International Joint Conference on Artificial Intelligence (IJCAI 99), San Francisco, pp. 702-707, 1999.

@InProceedings{Webb99,
author = {Webb, G. I.},
booktitle = {Proceedings of the Sixteenth International Joint Conference on Artificial Intelligence ({IJCAI} 99)},
title = {Decision Tree Grafting From The All Tests But One Partition},
year = {1999},
editor = {Dean, T.},
pages = {702--707},
publisher = {Morgan Kaufmann},
abstract = {Decision tree grafting adds nodes to an existing decision tree with the objective of reducing prediction error. A new grafting algorithm is presented that considers one set of training data only for each leaf of the initial decision tree, the set of cases that fail at most one test on the path to the leaf. This new technique is demonstrated to retain the error reduction power of the original grafting algorithm while dramatically reducing compute time and the complexity of the inferred tree. Bias/variance analysis reveals that the original grafting technique operated primarily by variance reduction while the new technique reduces both bias and variance.},
audit-trail = {PDF posted with the permission of {IJCAI} Inc},
keywords = {Decision Tree Learning and Decision Tree Grafting and Occams Razor},
location = {Stockholm, Sweden},
related = {decision-tree-grafting},
}
ABSTRACT Decision tree grafting adds nodes to an existing decision tree with the objective of reducing prediction error. A new grafting algorithm is presented that considers one set of training data only for each leaf of the initial decision tree, the set of cases that fail at most one test on the path to the leaf. This new technique is demonstrated to retain the error reduction power of the original grafting algorithm while dramatically reducing compute time and the complexity of the inferred tree. Bias/variance analysis reveals that the original grafting technique operated primarily by variance reduction while the new technique reduces both bias and variance.

Webb, G. I., Wells, J., & Zheng, Z.
Machine Learning, 35(1), 5-24, 1999.

@Article{WebbWellsZheng99,
author = {Webb, G. I. and Wells, J. and Zheng, Z.},
journal = {Machine Learning},
title = {An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition},
year = {1999},
number = {1},
pages = {5--24},
volume = {35},
abstract = {Machine learning and knowledge acquisition from experts have distinct capabilities that appear to complement one another. We report a study that demonstrates the integration of these approaches can both improve the accuracy of the developed knowledge base and reduce development time. In addition, we found that users expected the expert systems created through the integrated approach to have higher accuracy than those created without machine learning and rated the integrated approach less difficult to use. They also provided favorable evaluations of both the specific integrated software, a system called The Knowledge Factory, and of the general value of machine learning for knowledge acquisition.},
audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF Posted 30/10/03},
keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
publisher = {Springer},
related = {interactive-machine-learning},
}
ABSTRACT Machine learning and knowledge acquisition from experts have distinct capabilities that appear to complement one another. We report a study that demonstrates the integration of these approaches can both improve the accuracy of the developed knowledge base and reduce development time. In addition, we found that users expected the expert systems created through the integrated approach to have higher accuracy than those created without machine learning and rated the integrated approach less difficult to use. They also provided favorable evaluations of both the specific integrated software, a system called The Knowledge Factory, and of the general value of machine learning for knowledge acquisition.

Zheng, Z., Webb, G. I., & Ting, K. M.
Proceedings of the Sixteenth International Conference on Machine Learning (ICML-99), San Francisco, pp. 493-502, 1999.

@InProceedings{ZhengWebbTing99,
author = {Zheng, Z. and Webb, G. I. and Ting, K. M.},
booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML-99)},
title = {Lazy {Bayesian} Rules: A Lazy Semi-Naive {Bayesian} Learning Technique Competitive to Boosting Decision Trees},
year = {1999},
editor = {Bratko, I. and Dzeroski, S.},
pages = {493--502},
publisher = {Morgan Kaufmann},
abstract = {LBR is a lazy semi-naive Bayesian classifier learning technique, designed to alleviate the attribute interdependence problem of naive Bayesian classification. To classify a test example, it creates a conjunctive rule that selects a most appropriate subset of training examples and induces a local naive Bayesian classifier using this subset. LBR can significantly improve the performance of the naive Bayesian classifier. A bias and variance analysis of LBR reveals that it significantly reduces the bias of naive Bayesian classification at a cost of a slight increase in variance. It is interesting to compare this lazy technique with boosting and bagging, two well-known state-of-the-art non-lazy learning techniques. Empirical comparison of LBR with boosting decision trees on discrete valued data shows that LBR has, on average, significantly lower variance and higher bias. As a result of the interaction of these effects, the average prediction error of LBR over a range of learning tasks is at a level directly comparable to boosting. LBR provides a very competitive discrete valued learning technique where error minimization is the primary concern. It is very efficient when a single classifier is to be applied to classify few cases, such as in a typical incremental learning scenario.},
keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
location = {Bled, Slovenia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT LBR is a lazy semi-naive Bayesian classifier learning technique, designed to alleviate the attribute interdependence problem of naive Bayesian classification. To classify a test example, it creates a conjunctive rule that selects a most appropriate subset of training examples and induces a local naive Bayesian classifier using this subset. LBR can significantly improve the performance of the naive Bayesian classifier. A bias and variance analysis of LBR reveals that it significantly reduces the bias of naive Bayesian classification at a cost of a slight increase in variance. It is interesting to compare this lazy technique with boosting and bagging, two well-known state-of-the-art non-lazy learning techniques. Empirical comparison of LBR with boosting decision trees on discrete valued data shows that LBR has, on average, significantly lower variance and higher bias. As a result of the interaction of these effects, the average prediction error of LBR over a range of learning tasks is at a level directly comparable to boosting. LBR provides a very competitive discrete valued learning technique where error minimization is the primary concern. It is very efficient when a single classifier is to be applied to classify few cases, such as in a typical incremental learning scenario.

Zheng, Z., & Webb, G. I.
Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 321-332, 1998.

@InProceedings{ZhengWebb98a,
author = {Zheng, Z. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
title = {Stochastic Attribute Selection Committees},
year = {1998},
editor = {Antoniou, G. and Slaney, J. K.},
pages = {321--332},
publisher = {Springer-Verlag},
abstract = {Classifier committee learning methods generate multiple classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Two such methods, Bagging and Boosting, have shown great success with decision tree learning. They create different classifiers by modifying the distribution of the training set. This paper studies a different approach: Stochastic Attribute Selection Committee learning of decision trees. It generates classifier committees by stochastically modifying the set of attributes but keeping the distribution of the training set unchanged. An empirical evaluation of a variant of this method, namely Sasc, in a representative collection of natural domains shows that the SASC method can significantly reduce the error rate of decision tree learning. On average Sasc is more accurate than Bagging and less accurate than Boosting, although a one-tailed sign-test fails to show that these differences are significant at a level of 0.05. In addition, it is found that, like Bagging, Sasc is more stable than Boosting in terms of less frequently obtaining significantly higher error rates than C4.5 and, when error is raised, producing lower error rate increases. Moreover, like Bagging, Sasc is amenable to parallel and distributed processing while Boosting is not.},
audit-trail = {*},
keywords = {MultiBoosting and Stochastic Attribute Selection Committees},
location = {Brisbane, Australia},
related = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT Classifier committee learning methods generate multiple classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Two such methods, Bagging and Boosting, have shown great success with decision tree learning. They create different classifiers by modifying the distribution of the training set. This paper studies a different approach: Stochastic Attribute Selection Committee learning of decision trees. It generates classifier committees by stochastically modifying the set of attributes but keeping the distribution of the training set unchanged. An empirical evaluation of a variant of this method, namely Sasc, in a representative collection of natural domains shows that the SASC method can significantly reduce the error rate of decision tree learning. On average Sasc is more accurate than Bagging and less accurate than Boosting, although a one-tailed sign-test fails to show that these differences are significant at a level of 0.05. In addition, it is found that, like Bagging, Sasc is more stable than Boosting in terms of less frequently obtaining significantly higher error rates than C4.5 and, when error is raised, producing lower error rate increases. Moreover, like Bagging, Sasc is amenable to parallel and distributed processing while Boosting is not.

Chiu, B. C., & Webb, G. I.
User Modeling and User-Adapted Interaction, 8(1-2), 131-152, 1998.

@Article{ChiuWebb98,
author = {Chiu, B. C. and Webb, G. I.},
journal = {User Modeling and User-Adapted Interaction},
title = {Using Decision Trees For Agent Modelling: Improving Prediction Performance},
year = {1998},
number = {1-2},
pages = {131--152},
volume = {8},
abstract = {A modeling system may be required to predict an agent's future actions under constraints of inadequate or contradictory relevant historical evidence. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. A previous study that explored techniques for improving prediction rates in the context of modeling students' subtraction skills using Feature Based Modeling showed a tradeoff between prediction rate and prediction accuracy. This paper presents research that aims to improve prediction rates without affecting prediction accuracy. The FBM-C4.5 agent modeling system was used in this research. However, the techniques explored are applicable to any Feature Based Modeling system, and the most effective technique developed is applicable to most agent modeling systems. The default FBM-C4.5 system models agents' competencies with a set of decision trees, trained on all historical data. Each tree predicts one particular aspect of the agent's action. Predictions from multiple trees are compared for consensus. FBM-C4.5 makes no prediction when predictions from different trees contradict one another. This strategy trades off reduced prediction rates for increased accuracy. To make predictions in the absence of consensus, three techniques have been evaluated. They include using voting, using a tree quality measure and using a leaf quality measure. An alternative technique that merges multiple decision trees into a single tree provides an advantage of producing models that are more comprehensible. However, all of these techniques demonstrated the previously encountered trade-off between rate of prediction and accuracy of prediction, albeit less pronounced. It was hypothesized that models built on more current observations would outperform models built on earlier observations. Experimental results support this hypothesis.
A Dual-model system, which takes this temporal factor into account, has been evaluated. This fifth approach achieved a significant improvement in prediction rate without significantly affecting prediction accuracy.},
audit-trail = {Link via {ACM} Portal},
keywords = {Feature Based Modeling and User Modeling},
publisher = {Springer},
related = {feature-based-modeling},
}
ABSTRACT A modeling system may be required to predict an agent's future actions under constraints of inadequate or contradictory relevant historical evidence. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. A previous study that explored techniques for improving prediction rates in the context of modeling students' subtraction skills using Feature Based Modeling showed a tradeoff between prediction rate and prediction accuracy. This paper presents research that aims to improve prediction rates without affecting prediction accuracy. The FBM-C4.5 agent modeling system was used in this research. However, the techniques explored are applicable to any Feature Based Modeling system, and the most effective technique developed is applicable to most agent modeling systems. The default FBM-C4.5 system models agents' competencies with a set of decision trees, trained on all historical data. Each tree predicts one particular aspect of the agent's action. Predictions from multiple trees are compared for consensus. FBM-C4.5 makes no prediction when predictions from different trees contradict one another. This strategy trades off reduced prediction rates for increased accuracy. To make predictions in the absence of consensus, three techniques have been evaluated. They include using voting, using a tree quality measure and using a leaf quality measure. An alternative technique that merges multiple decision trees into a single tree provides an advantage of producing models that are more comprehensible. However, all of these techniques demonstrated the previously encountered trade-off between rate of prediction and accuracy of prediction, albeit less pronounced. It was hypothesized that models built on more current observations would outperform models built on earlier observations. Experimental results support this hypothesis.
A Dual-model system, which takes this temporal factor into account, has been evaluated. This fifth approach achieved a significant improvement in prediction rate without significantly affecting prediction accuracy.

Zheng, Z., Webb, G. I., & Ting, K. M.
Proceedings of the Tenth IEEE International Conference on Tools with Artificial Intelligence (ICTAI-98), Los Alamitos, CA, pp. 216-223, 1998.

@InProceedings{ZhengWebbTing98,
author = {Zheng, Z. and Webb, G. I. and Ting, K. M.},
booktitle = {Proceedings of the Tenth {IEEE} International Conference on Tools with Artificial Intelligence (ICTAI-98)},
title = {Integrating Boosting and Stochastic Attribute Selection Committees for Further Improving The Performance of Decision Tree Learning},
year = {1998},
pages = {216--223},
publisher = {{IEEE} Computer Society Press},
abstract = {Techniques for constructing classifier committees including boosting and bagging have demonstrated great success, especially boosting for decision tree learning. This type of technique generates several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Boosting and bagging create different classifiers by modifying the distribution of the training set. SASC (Stochastic Attribute Selection Committees) uses an alternative approach to generating classifier committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. We propose a method for improving the performance of boosting. This technique combines boosting and SASC. It builds classifier committees by manipulating both the distribution of the training set and the set of attributes available during induction. In the synergy SASC effectively increases the model diversity of boosting. Experiments with a representative collection of natural domains show that, on average, the combined technique outperforms either boosting or SASC alone in terms of reducing the error rate of decision tree learning.},
audit-trail = {Available via Citeseer http://citeseer.ist.psu.edu/4952.html},
keywords = {MultiBoosting and Boosting and Stochastic Attribute Selection Committees},
location = {Taipei, Taiwan},
related = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT Techniques for constructing classifier committees including boosting and bagging have demonstrated great success, especially boosting for decision tree learning. This type of technique generates several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Boosting and bagging create different classifiers by modifying the distribution of the training set. SASC (Stochastic Attribute Selection Committees) uses an alternative approach to generating classifier committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. We propose a method for improving the performance of boosting. This technique combines boosting and SASC. It builds classifier committees by manipulating both the distribution of the training set and the set of attributes available during induction. In the synergy SASC effectively increases the model diversity of boosting. Experiments with a representative collection of natural domains show that, on average, the combined technique outperforms either boosting or SASC alone in terms of reducing the error rate of decision tree learning.

Zheng, Z., & Webb, G. I.
Proceedings of the 1998 International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'98), pp. 1133-1140, 1998.

@InProceedings{ZhengWebb98b,
author = {Zheng, Z. and Webb, G. I.},
booktitle = {Proceedings of the 1998 International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'98)},
title = {Multiple Boosting: A Combination of Boosting and Bagging},
year = {1998},
pages = {1133--1140},
publisher = {CSREA Press},
abstract = {Classifier committee learning approaches have demonstrated great success in increasing the prediction accuracy of classifier learning, which is a key technique for datamining. These approaches generate several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. It has been shown that Boosting and Bagging, as two representative methods of this type, can significantly decrease the error rate of decision tree learning. Boosting is generally more accurate than Bagging, but the former is more variable than the latter. In addition, bagging is amenable to parallel or distributed processing, while Boosting is not. In this paper, we study a new committee learning algorithm, namely MB (Multiple Boosting). It creates multiple subcommittees by combining Boosting and Bagging. Experimental results in a representative collection of natural domains show that MB is, on average, more accurate than either Bagging or Boosting alone. It is more stable than Boosting, and is amenable to parallel or distributed processing. These characteristics make MB a good choice for parallel datamining.},
audit-trail = {*},
keywords = {MultiBoosting},
related = {multiboosting-and-multi-strategy-ensemble-learning},
}
ABSTRACT Classifier committee learning approaches have demonstrated great success in increasing the prediction accuracy of classifier learning, which is a key technique for datamining. These approaches generate several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. It has been shown that Boosting and Bagging, as two representative methods of this type, can significantly decrease the error rate of decision tree learning. Boosting is generally more accurate than Bagging, but the former is more variable than the latter. In addition, bagging is amenable to parallel or distributed processing, while Boosting is not. In this paper, we study a new committee learning algorithm, namely MB (Multiple Boosting). It creates multiple subcommittees by combining Boosting and Bagging. Experimental results in a representative collection of natural domains show that MB is, on average, more accurate than either Bagging or Boosting alone. It is more stable than Boosting, and is amenable to parallel or distributed processing. These characters� characteristics make MB a good choice for parallel datamining� ing.

Viswanathan, M., & Webb, G. I.
Lecture Notes in Computer Science 1398: Proceedings of the Tenth European Conference on Machine Learning (ECML'98), Berlin/Heidelberg, pp. 149-159, 1998.

@InProceedings{ViswanathanWebb98,
author = {Viswanathan, M. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science 1398: Proceedings of the Tenth European Conference on Machine Learning (ECML'98)},
title = {Classification Learning Using All Rules},
year = {1998},
editor = {Nedellec, C. and Rouveirol, C.},
pages = {149--159},
publisher = {Springer},
abstract = {The covering algorithm has been ubiquitous in the induction of classification rules. This approach to machine learning uses heuristic search that seeks to find a minimum number of rules that adequately explains the data. However, recent research has provided evidence that learning redundant classifiers can increase predictive accuracy. Learning all possible classifiers seems to be a plausible form of this nomination of redundant classifiers. This paper presents an algorithm that in effect learns all classifiers. Preliminary investigations by Webb (1996b) suggest that a heuristic covering algorithm in general learns classification rules with higher predictive accuracy than those learned by this new approach. In this paper we present an extensive empirical comparison between the learning-all-rules algorithm and three varied established approaches to inductive learning, namely a covering algorithm, an instance-based learner and a decision tree learner. Empirical evaluation provides strong evidence in support of learning-all-rules as a plausible approach to inductive learning.},
audit-trail = {Springerlink not up for this volume yet.},
keywords = {Lazy Learning and Rule Learning},
location = {Chemnitz, Germany},
}
ABSTRACT The covering algorithm has been ubiquitous in the induction of classification rules. This approach to machine learning uses heuristic search that seeks to find a minimum number of rules that adequately explains the data. However, recent research has provided evidence that learning redundant classifiers can increase predictive accuracy. Learning all possible classifiers seems to be a plausible form of this nomination of redundant classifiers. This paper presents an algorithm that in effect learns all classifiers. Preliminary investigations by Webb (1996b) suggest that a heuristic covering algorithm in general learns classification rules with higher predictive accuracy than those learned by this new approach. In this paper we present an extensive empirical comparison between the learning-all-rules algorithm and three varied established approaches to inductive learning, namely a covering algorithm, an instance-based learner and a decision tree learner. Empirical evaluation provides strong evidence in support of learning-all-rules as a plausible approach to inductive learning.

Webb, G. I.
User Modeling and User-Adapted Interaction, 8(1), 1-3, 1998.

@Article{Webb98a,
author = {Webb, G. I.},
journal = {User Modeling and User-Adapted Interaction},
title = {Preface to {UMUAI} Special Issue on Machine Learning for User Modeling},
year = {1998},
number = {1},
pages = {1--3},
volume = {8},
audit-trail = {Link via Kluwer site},
keywords = {User Modeling},
}
ABSTRACT 

Webb, G. I., & Kuzmycz, M.
Lecture Notes in Computer Science Vol. 1452: Proceedings of the Fourth International Conference on Intelligent Tutoring Systems (ITS '98), Berlin, pp. 384-393, 1998.

@InProceedings{WebbKuzmycz98,
author = {Webb, G. I. and Kuzmycz, M.},
booktitle = {Lecture Notes in Computer Science Vol. 1452: Proceedings of the Fourth International Conference on Intelligent Tutoring Systems (ITS '98)},
title = {Evaluation Of Data Aging: A Technique For Discounting Old Data During Student Modeling},
year = {1998},
editor = {Goettl, B. P. and Halff, H. M. and Redfield, C. and Shute, V.},
pages = {384--393},
publisher = {Springer-Verlag},
abstract = {Student modeling systems must operate in an environment in which a student's mastery of a subject matter is likely to change as a lesson progresses. A student model is formed from evaluation of evidence about the student's mastery of the domain. However, given that such mastery will change, older evidence is likely to be less valuable than recent evidence. Data aging addresses this issue by discounting the value of older evidence. This paper provides experimental evaluation of the effects of data aging. While it is demonstrated that data aging can result in statistically significant increases in both the number and accuracy of predictions that a modeling system makes, it is also demonstrated that the reverse can be true. Further, the effects experienced are of only small magnitude. It is argued that these results demonstrate some potential for data aging as a general strategy, but do not warrant employing data aging in its current form.},
audit-trail = {PDF posted},
keywords = {Feature Based Modeling and User Modeling},
location = {San Antonio, Texas},
related = {feature-based-modeling},
}
ABSTRACT Student modeling systems must operate in an environment in which a student's mastery of a subject matter is likely to change as a lesson progresses. A student model is formed from evaluation of evidence about the student's mastery of the domain. However, given that such mastery will change, older evidence is likely to be less valuable than recent evidence. Data aging addresses this issue by discounting the value of older evidence. This paper provides experimental evaluation of the effects of data aging. While it is demonstrated that data aging can result in statistically significant increases in both the number and accuracy of predictions that a modeling system makes, it is also demonstrated that the reverse can be true. Further, the effects experienced are of only small magnitude. It is argued that these results demonstrate some potential for data aging as a general strategy, but do not warrant employing data aging in its current form.

Webb, G. I., & Pazzani, M.
Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 285-295, 1998.

@InProceedings{WebbPazzani98,
author = {Webb, G. I. and Pazzani, M.},
booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
title = {Adjusted Probability Naive {Bayesian} Induction},
year = {1998},
editor = {Antoniou, G. and Slaney, J. K.},
pages = {285--295},
publisher = {Springer-Verlag},
abstract = {Naive Bayesian classifiers utilise a simple mathematical model for induction. While it is known that the assumptions on which this model is based are frequently violated, the predictive accuracy obtained in discriminate classification tasks is surprisingly competitive in comparison to more complex induction techniques. Adjusted probability naive Bayesian induction adds a simple extension to the naive Bayesian classifier. A numeric weight is inferred for each class. During discriminate classification, the naive Bayesian probability of a class is multiplied by its weight to obtain an adjusted value. The use of this adjusted value in place of the naive Bayesian probability is shown to significantly improve predictive accuracy.},
audit-trail = {*},
keywords = {Conditional Probability Estimation and Bayesian Learning},
location = {Brisbane, Australia},
related = {learning-complex-conditional-probabilities-from-data},
}
ABSTRACT Naive Bayesian classifiers utilise a simple mathematical model for induction. While it is known that the assumptions on which this model is based are frequently violated, the predictive accuracy obtained in discriminate classification tasks is surprisingly competitive in comparison to more complex induction techniques. Adjusted probability naive Bayesian induction adds a simple extension to the naive Bayesian classifier. A numeric weight is inferred for each class. During discriminate classification, the naive Bayesian probability of a class is multiplied by its weight to obtain an adjusted value. The use of this adjusted value in place of the naive Bayesian probability is shown to significantly improve predictive accuracy.

Webb, G. I.
Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 273-283, 1998.

@InProceedings{Webb98,
author = {Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
title = {The Problem of Missing Values in Decision Tree Grafting},
year = {1998},
editor = {Antoniou, G. and Slaney, J. K.},
pages = {273--283},
publisher = {Springer-Verlag},
abstract = {Decision tree grafting adds nodes to inferred decision trees. Previous research has demonstrated that appropriate grafting techniques can improve predictive accuracy across a wide cross-selection of domains. However, previous decision tree grafting systems are demonstrated to have a serious deficiency for some data sets containing missing values. This problem arises due to the method for handling missing values employed by C4.5, in which the grafting systems have been embedded. This paper provides an explanation of and solution to the problem. Experimental evidence is presented of the efficacy of this solution.},
audit-trail = {*},
keywords = {Decision Tree Learning and Decision Tree Grafting and Occams Razor},
location = {Brisbane, Australia},
related = {decision-tree-grafting},
}
ABSTRACT Decision tree grafting adds nodes to inferred decision trees. Previous research has demonstrated that appropriate grafting techniques can improve predictive accuracy across a wide cross�selection of domains. However, previous decision tree grafting systems are demonstrated to have a serious deficiency for some data sets containing missing values. This problem arises due to the method for handling missing values employed by C4.5, in which the grafting systems have been embedded. This paper provides an explanation of and solution to the problem. Experimental evidence is presented of the efficacy of this solution.

Smith, P. A., & Webb, G. I.
Proceedings of the Sixth International Conference on Computers in Education (ICCE '98), Berlin, pp. 213-216, 1998.

@InProceedings{SmithWebb98,
author = {Smith, P. A. and Webb, G. I.},
booktitle = {Proceedings of the Sixth International Conference on Computers in Education (ICCE '98)},
title = {Overview of a Low-Level Program Visualisation Tool for Novice Programmers},
year = {1998},
pages = {213--216},
publisher = {Springer-Verlag},
abstract = {As a programming novice attempts to attain expertise in programming she must develop adequate mental models and knowledge structures of the programming process. Unfortunately, many of the computerised tools to which novice programmers have access are designed by expert programmers for experts and as such do not meet the needs of novices. Low-level program visualisation tools make explicit the internal workings of program execution and as such can serve as conceptual models onto which novices can assimilate information about programming. This paper discusses the need for such a tool, what features such a tool may include and gives a brief description of an evaluation of a low-level program visualisation tool developed at Deakin University.},
audit-trail = {Reconstructed paper posted Nov 05},
keywords = {Program Visualisation},
location = {Beijing},
related = {program-visualisation},
}
ABSTRACT As a programming novice attempts to attain expertise in programming she must develop adequate mental models and knowledge structures of the programming process. Unfortunately, many of the computerised tools to which novice programmers have access are designed by expert programmers for experts and as such do not meet the needs of novices. Low-level program visualisation tools make explicit the internal workings of program execution and as such can serve as conceptual models onto which novices can assimilate information about programming. This paper discusses the need for such a tool, what features such a tool may include and gives a brief description of an evaluation of a low-level program visualisation tool developed at Deakin University.

Webb, G. I.
Proceedings of the Fifteenth International Joint Conference on Artificial Intelligence (IJCAI 97), San Francisco, pp. 846-851, 1997.

@InProceedings{Webb97,
author = {Webb, G. I.},
booktitle = {Proceedings of the Fifteenth International Joint Conference on Artificial Intelligence ({IJCAI} 97)},
title = {Decision Tree Grafting},
year = {1997},
pages = {846--851},
publisher = {Morgan Kaufmann},
abstract = {This paper extends recent work on decision tree grafting. Grafting is an inductive process that adds nodes to inferred decision trees. This process is demonstrated to frequently improve predictive accuracy. Superficial analysis might suggest that decision tree grafting is the direct reverse of pruning. To the contrary, it is argued that the two processes are complementary. This is because, like standard tree growing techniques, pruning uses only local information, whereas grafting uses non-local information. The use of both pruning and grafting in conjunction is demonstrated to provide the best general predictive accuracy over a representative selection of learning tasks.},
audit-trail = {PDF posted with the permission of IJCAI Inc},
keywords = {Decision Trees and Decision Tree Grafting and Occams Razor},
location = {Nagoya, Japan},
related = {decision-tree-grafting},
}
ABSTRACT This paper extends recent work on decision tree grafting. Grafting is an inductive process that adds nodes to inferred decision trees. This process is demonstrated to frequently improve predictive accuracy. Superficial analysis might suggest that decision tree grafting is the direct reverse of pruning. To the contrary, it is argued that the two processes are complementary. This is because, like standard tree growing techniques, pruning uses only local information, whereas grafting uses non-local information. The use of both pruning and grafting in conjunction is demonstrated to provide the best general predictive accuracy over a representative selection of learning tasks.

Chiu, B. C., Webb, G. I., & Kuzmycz, M.
Proceedings of the Sixth International Conference on User Modeling (UM'97), New York/Vienna, pp. 347-358, 1997.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@InProceedings{ChiuWebbKuzmycz97,
author = {Chiu, B. C. and Webb, G. I. and Kuzmycz, M.},
booktitle = {Proceedings of the Sixth International Conference on User Modeling (UM'97)},
title = {A Comparison of First-Order and Zeroth-Order Induction for Input-Output Agent Modelling},
year = {1997},
editor = {Jameson, A. and Paris, C. and Tasso, C.},
pages = {347--358},
publisher = {Springer},
abstract = {Most student modelling systems seek to develop a model of the internal operation of the cognitive system. In contrast, Input-Output Agent Modelling (IOAM) models an agent in terms of relationships between the inputs and outputs of the cognitive system. Previous IOAM systems have demonstrated high predictive accuracy in the domain of elementary subtraction. These systems use zeroth-order induction. Many of the predicates used, however, represent relations. This suggests that first-order induction might perform well in this domain. This paper reports a study in which zeroth-order and first-order induction engines were used to build models of student subtraction skills. Comparative evaluation shows that zeroth-order induction performs better than first-order in detecting regularities indicating misconceptions while first-order induction leads zeroth-order in detecting regularities indicating correct concepts and inducing a more comprehensible student model. This suggests there exists a trade-off between these factors and that there is still scope for improvement.},
audit-trail = {*},
doi = {10.1007/978-3-7091-2670-7_35},
keywords = {Feature Based Modeling and User Modeling},
location = {Chia Laguna, Sardinia},
related = {feature-based-modeling},
}
ABSTRACT Most student modelling systems seek to develop a model of the internal operation of the cognitive system. In contrast, Input-Output Agent Modelling (IOAM) models an agent in terms of relationships between the inputs and outputs of the cognitive system. Previous IOAM systems have demonstrated high predictive accuracy in the domain of elementary subtraction. These systems use zeroth-order induction. Many of the predicates used, however, represent relations. This suggests that first-order induction might perform well in this domain. This paper reports a study in which zeroth-order and first-order induction engines were used to build models of student subtraction skills. Comparative evaluation shows that zeroth-order induction performs better than first-order in detecting regularities indicating misconceptions while first-order induction leads zeroth-order in detecting regularities indicating correct concepts and inducing a more comprehensible student model. This suggests there exists a trade-off between these factors and that there is still scope for improvement.

Chiu, B. C., & Webb, G. I.
Proceedings (on-line) of The First Machine Learning for User Modeling Workshop (UM'97), 1997.
[Bibtex]  → Download PDF  → Access on publisher site  → Related papers and software

@InProceedings{ChiuWebb97,
title = {Using C4.5 as an Induction Engine for Agent Modeling: An Experiment of Optimisation},
author = {Chiu, B. C. and Webb, G. I.},
year = {1997},
booktitle = {Proceedings (on-line) of The First Machine Learning for User Modeling Workshop (UM'97)},
location = {Chia Laguna, Sardinia},
doi = {10.1023/A:1008296930163},
related = {feature-based-modeling},
keywords = {Feature Based Modeling and User Modeling},
}
ABSTRACT 

Webb, G. I., Chiu, B. C., & Kuzmycz, M.
International Journal of Artificial Intelligence in Education, 8, 97-115, 1997.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{WebbChiuKuzmycz97,
author = {Webb, G. I. and Chiu, B. C. and Kuzmycz, M.},
journal = {International Journal of Artificial Intelligence in Education},
title = {Comparative Evaluation of Alternative Induction Engines for Feature Based Modelling},
year = {1997},
pages = {97--115},
volume = {8},
abstract = {Feature Based Modelling has demonstrated the ability to produce agent models with high accuracy in predicting an agent's future actions. There are a number of respects in which this modelling technique is novel. However, there has been no previous analysis of which aspects of the approach are responsible for its performance. One distinctive feature of the approach is a purpose built induction module. This paper presents a study in which the original custom built Feature Based Modelling induction module was replaced by the C4.5 machine learning system. Comparative evaluation shows that the use of C4.5 increases the number of predictions made without significantly altering the accuracy of those predictions. This suggests that it is the general input-output agent modelling methodology used with both systems that has primary responsibility for the high predictive accuracy previously reported for Feature Based Modelling, rather than its initial idiosyncratic induction technique.},
audit-trail = {Link via IJAIED site},
keywords = {Feature Based Modeling and User Modeling},
publisher = {IOS Press},
related = {feature-based-modeling},
url = {https://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.36.3545},
}
ABSTRACT Feature Based Modelling has demonstrated the ability to produce agent models with high accuracy in predicting an agent's future actions. There are a number of respects in which this modelling technique is novel. However, there has been no previous analysis of which aspects of the approach are responsible for its performance. One distinctive feature of the approach is a purpose built induction module. This paper presents a study in which the original custom built Feature Based Modelling induction module was replaced by the C4.5 machine learning system. Comparative evaluation shows that the use of C4.5 increases the number of predictions made without significantly altering the accuracy of those predictions. This suggests that it is the general input-output agent modelling methodology used with both systems that has primary responsibility for the high predictive accuracy previously reported for Feature Based Modelling, rather than its initial idiosyncratic induction technique.

Chiu, B. C., Webb, G. I., & Zheng, Z.
Lecture Notes in Computer Science Vol. 1342: Proceedings of the Tenth Australian Joint Conference on Artificial Intelligence (AI'97), Berlin, pp. 349-358, 1997.

@InProceedings{ChiuWebbZheng97,
author = {Chiu, B. C. and Webb, G. I. and Zheng, Z.},
booktitle = {Lecture Notes in Computer Science Vol. 1342: Proceedings of the Tenth Australian Joint Conference on Artificial Intelligence (AI'97)},
title = {Using Decision Trees for Agent Modelling: A Study on Resolving Conflicting Predictions},
year = {1997},
editor = {Sattar, A.},
pages = {349--358},
publisher = {Springer-Verlag},
abstract = {Input-Output Agent Modelling (IOAM) is an approach to modelling an agent in terms of relationships between the inputs and outputs of the cognitive system. This approach, together with a leading inductive learning algorithm, C4.5, has been adopted to build a subtraction skill modeller, C4.5-IOAM. It models agents' competencies with a set of decision trees. C4.5-IOAM makes no prediction when predictions from different decision trees are contradictory. This paper proposes three techniques for resolving such situations. Two techniques involve selecting the more reliable prediction from a set of competing predictions using a tree quality measure and a leaf quality measure. The other technique merges multiple decision trees into a single tree. This has the additional advantage of producing more comprehensible models. Experimental results, in the domain of modelling elementary subtraction skills, showed that the tree quality and the leaf quality of a decision path provided valuable references for resolving contradicting predictions and a single tree model representation performed nearly equally well to the multi-tree model representation.},
audit-trail = {Reconstructed paper posted 11/10/05},
keywords = {Feature Based Modeling and User Modeling},
location = {Perth, Australia},
related = {feature-based-modeling},
}
ABSTRACT Input-Output Agent Modelling (IOAM) is an approach to modelling an agent in terms of relationships between the inputs and outputs of the cognitive system. This approach, together with a leading inductive learning algorithm, C4.5, has been adopted to build a subtraction skill modeller, C4.5-IOAM. It models agents' competencies with a set of decision trees. C4.5-IOAM makes no prediction when predictions from different decision trees are contradictory. This paper proposes three techniques for resolving such situations. Two techniques involve selecting the more reliable prediction from a set of competing predictions using a free quality measure and a leaf quality measure. The other technique merges multiple decision trees into a single tree. This has the additional advantage of producing more comprehensible models. Experimental results, in the domain of modelling elementary subtraction skills, showed that the tree quality and the leaf quality of a decision path provided valuable references for resolving contradicting predictions and a single tree model representation performed nearly equally well to the multi-tree model representation.

Webb, G. I.
Knowledge-Based Systems, 9, 253-266, 1996.

@Article{Webb96a,
author = {Webb, G. I.},
journal = {Knowledge-Based Systems},
title = {Integrating Machine Learning With Knowledge Acquisition Through Direct Interaction With Domain Experts},
year = {1996},
pages = {253--266},
volume = {9},
abstract = {Knowledge elicitation from experts and empirical machine learning are two distinct approaches to knowledge acquisition with differing and mutually complementary capabilities. Learning apprentices have provided environments in which a knowledge engineer may collaborate with a machine learning system, allowing for a synergy between the complementary approaches. The Knowledge Factory is a knowledge acquisition environment that allows a domain expert to collaborate directly with a machine learning system without the need for assistance from a knowledge engineer. This requires a different form of environment to the learning apprentice. This paper describes techniques for supporting such interactions and their implementation in a knowledge acquisition environment called The Knowledge Factory.},
audit-trail = {Link via Science Direct},
keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
publisher = {Elsevier},
related = {interactive-machine-learning},
}
ABSTRACT Knowledge elicitation from experts and empirical machine learning are two distinct approaches to knowledge acquisition with differing and mutually complementary capabilities. Learning apprentices have provided environments in which a knowledge engineer may collaborate with a machine learning system allowing, for a synergy between the complementary approaches. The Knowledge Factory is a knowledge acquisition environment that allows a domain expert to collaborate directly with a machine learning system without the need for assistance from a knowledge engineer. This requires a different form of environment to the learning apprentice. This paper describes techniques for supporting such interactions and their implementation in a knowledge acquisition environment called The Knowledge Factory.

Webb, G. I.
Journal of Artificial Intelligence Research, 4, 397-417, 1996.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb96b,
Title = {Further Experimental Evidence Against The Utility Of {Occam's} Razor},
Author = {Webb, G. I.},
Journal = {Journal of Artificial Intelligence Research},
Year = {1996},
Pages = {397--417},
Volume = {4},
Abstract = {This paper presents new experimental evidence against the utility of Occam's razor. A systematic procedure is presented for post-processing decision trees produced by C4.5. This procedure was derived by rejecting Occam's razor and instead attending to the assumption that similar objects are likely to belong to the same class. It increases a decision tree's complexity without altering the performance of that tree on the training data from which it is inferred. The resulting more complex decision trees are demonstrated to have, on average, for a variety of common learning tasks, higher predictive accuracy than the less complex original decision trees. This result raises considerable doubt about the utility of Occam's razor as it is commonly applied in modern machine learning.},
Audit-trail = {Link to paper via JAIR website},
Doi = {10.1613/jair.228},
Keywords = {Decision Trees and Decision Tree Grafting and Occams Razor},
Publisher = {AAAI Press},
Related = {occams-razor-in-machine-learning}
}
ABSTRACT This paper presents new experimental evidence against the utility of Occam's razor. A systematic procedure is presented for post-processing decision trees produced by C4.5. This procedure was derived by rejecting Occam's razor and instead attending to the assumption that similar objects are likely to belong to the same class. It increases a decision tree's complexity without altering the performance of that tree on the training data from which it is inferred. The resulting more complex decision trees are demonstrated to have, on average, for a variety of common learning tasks, higher predictive accuracy than the less complex original decision trees. This result raises considerable doubt about the utility of Occam's razor as it is commonly applied in modern machine learning.

Webb, G. I.
Proceedings of Information, Statistics and Induction in Science (ISIS '96), Singapore, pp. 20-30, 1996.

@InProceedings{Webb96c,
Title = {A Heuristic Covering Algorithm Outperforms Learning All Rules},
Author = {Webb, G. I.},
Booktitle = {Proceedings of Information, Statistics and Induction in Science (ISIS '96)},
Year = {1996},
Pages = {20--30},
Publisher = {World Scientific},
Abstract = {The induction of classification rules has been dominated by a single generic technique-the covering algorithm. This approach employs a simple hill-climbing search to learn sets of rules. Such search is subject to numerous widely known deficiencies. Further, there is a growing body of evidence that learning redundant sets of rules can improve predictive accuracy. The ultimate end-point of a move toward learning redundant rule sets would appear to be to learn and employ all possible rules. This paper presents a learning system that does this. An empirical investigation shows that, while the approach often achieves higher predictive accuracy than a covering algorithm, the covering algorithm outperforms induction of all rules significantly more frequently. Preliminary analysis suggests that learning all rules performs well when the training set clearly defines the decision surfaces but that the heuristic covering algorithm performs better when the decision surfaces are not clearly delineated by the training examples.},
Audit-trail = {*},
Keywords = {Lazy Learning and Rule Learning},
Location = {Melbourne, Australia}
}
ABSTRACT The induction of classification rules has been dominated by a single generic technique—the covering algorithm. This approach employs a simple hill-climbing search to learn sets of rules. Such search is subject to numerous widely known deficiencies. Further, there is a growing body of evidence that learning redundant sets of rules can improve predictive accuracy. The ultimate end-point of a move toward learning redundant rule sets would appear to be to learn and employ all possible rules. This paper presents a learning system that does this. An empirical investigation shows that, while the approach often achieves higher predictive accuracy than a covering algorithm, the covering algorithm outperforms induction of all rules significantly more frequently. Preliminary analysis suggests that learning all rules performs well when the training set clearly defines the decision surfaces but that the heuristic covering algorithm performs better when the decision surfaces are not clearly delineated by the training examples.

Webb, G. I., & Kuzmycz, M.
User Modeling and User-Adapted Interaction, 5(2), 117-150, 1996.

@Article{WebbKuzmycz96,
author = {Webb, G. I. and Kuzmycz, M.},
journal = {User Modeling and User-Adapted Interaction},
title = {Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents' Competencies},
year = {1996},
number = {2},
pages = {117--150},
volume = {5},
abstract = {Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90% when predicting student solutions. It also demonstrates the ability to identify and model student's buggy arithmetic procedures.},
audit-trail = {Kluwer Online publications only available from Dec 1997 onwards. Not found via {ACM} Portal},
keywords = {Feature Based Modeling},
publisher = {Springer},
related = {feature-based-modeling},
}
ABSTRACT Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90% when predicting student solutions. It also demonstrates the ability to identify and model student's buggy arithmetic procedures.

Webb, G. I.
Lecture Notes in Computer Science Vol. 1114. Topics in Artificial Intelligence: Proceedings of the Fourth Pacific Rim International Conference on Artificial Intelligence (PRICAI'96), Berlin/Heidelberg, pp. 23-34, 1996.

@InProceedings{Webb96d,
Title = {Cost Sensitive Specialisation},
Author = {Webb, G. I.},
Booktitle = {Lecture Notes in Computer Science Vol. 1114. Topics in Artificial Intelligence: Proceedings of the Fourth {Pacific} Rim International Conference on Artificial Intelligence (PRICAI'96)},
Year = {1996},
Editor = {Foo, N. Y. and Goebel, R.},
Pages = {23--34},
Publisher = {Springer-Verlag},
Abstract = {Cost-sensitive specialization is a generic technique for misclassification cost sensitive induction. This technique involves specializing aspects of a classifier associated with high misclassification costs and generalizing those associated with low misclassification costs. It is widely applicable and simple to implement. It could be used to augment the effect of standard cost-sensitive induction techniques. It should directly extend to test application cost sensitive induction tasks. Experimental evaluation demonstrates consistent positive effects over a range of misclassification cost sensitive learning tasks.},
Audit-trail = {*},
Keywords = {Cost Sensitive Learning and Generality},
Location = {Cairns, Australia},
Related = {generality-is-predictive-of-prediction-accuracy}
}
ABSTRACT Cost-sensitive specialization is a generic technique for misclassification cost sensitive induction. This technique involves specializing aspects of a classifier associated with high misclassification costs and generalizing those associated with low misclassification costs. It is widely applicable and simple to implement. It could be used to augment the effect of standard cost-sensitive induction techniques. It should directly extend to test application cost sensitive induction tasks. Experimental evaluation demonstrates consistent positive effects over a range of misclassification cost sensitive learning tasks.

Webb, G. I., & Wells, J.
Proceedings of the 1996 Pacific Knowledge Acquisition Workshop (PKAW'96), Sydney, pp. 170-189, 1996.

@InProceedings{WebbWells96,
author = {Webb, G. I. and Wells, J.},
booktitle = {Proceedings of the 1996 {Pacific} Knowledge Acquisition Workshop (PKAW'96)},
title = {An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition Through Direct Interaction with Domain Experts},
year = {1996},
editor = {Compton, P. and Mizoguchi, R. and Motoda, H. and Menzies, T.},
pages = {170--189},
publisher = {UNSW Press},
abstract = {Machine learning and knowledge acquisition from experts have distinct and apparently complementary knowledge acquisition capabilities. This study demonstrates that the integration of these approaches can both improve the accuracy of the knowledge base that is developed and reduce the time taken to develop it. The system studied, called The Knowledge Factory, is distinguished by the manner in which it supports direct interaction with domain experts with little or no knowledge engineering expertise. The benefits reported relate to use by such users. In addition to the improved quality of the knowledge base, in questionnaire responses the users provided favourable evaluations of the integration of machine learning with knowledge acquisition within the system.},
audit-trail = {Reconstructed paper posted April 2006},
keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
location = {Coogee, Sydney, Australia},
related = {interactive-machine-learning},
}
ABSTRACT Machine learning and knowledge acquisition from experts have distinct and apparently complementary knowledge acquisition capabilities. This study demonstrates that the integration of these approaches can both improve the accuracy of the knowledge base that is developed and reduce the time taken to develop it. The system studied, called The Knowledge Factory, is distinguished by the manner in which it supports direct interaction with domain experts with little or no knowledge engineering expertise. The benefits reported relate to use by such users. In addition to the improved quality of the knowledge base, in questionnaire responses the users provided favourable evaluations of the integration of machine learning with knowledge acquisition within the system.

Webb, G. I.
Australian Computer Science Communications Vol. 18 (1): Proceedings of the Nineteenth Australasian Computer Science Conference (ACSC'96), Melbourne, pp. 1-10, 1996.

@InProceedings{Webb96e,
Title = {Inclusive Pruning: A New Class of Pruning Rule for Unordered Search and its Application to Classification Learning},
Author = {Webb, G. I.},
Booktitle = {Australian Computer Science Communications Vol. 18 (1): Proceedings of the Nineteenth Australasian Computer Science Conference (ACSC'96)},
Year = {1996},
Editor = {Ramamohanarao, K.},
Pages = {1--10},
Publisher = {ACS},
Abstract = {This paper presents a new class of pruning rule for unordered search. Previous pruning rules for unordered search identify operators that should not be applied in order to prune nodes reached via those operators. In contrast, the new pruning rules identify operators that should be applied and prune nodes that are not reached via those operators. Specific pruning rules employing both these approaches are identified for classification learning. Experimental results demonstrate that application of the new pruning rules can reduce by more than 60% the number of states from the search space that are considered during classification learning.},
Audit-trail = {*},
Keywords = {Search and Rule Learning and OPUS and Association Rule Discovery},
Location = {Royal Melbourne Institute of Technology, Australia},
Related = {opus-search}
}
ABSTRACT This paper presents a new class of pruning rule for unordered search. Previous pruning rules for unordered search identify operators that should not be applied in order to prune nodes reached via those operators. In contrast, the new pruning rules identify operators that should be applied and prune nodes that are not reached via those operators. Specific pruning rules employing both these approaches are identified for classification learning. Experimental results demonstrate that application of the new pruning rules can reduce by more than 60% the number of states from the search space that are considered during classification learning.

Newlands, D., & Webb, G. I.
Proceedings of the Eighth International Conference on Industrial and Engineering Applications of Artificial Intelligence and Expert Systems (IEA/AIE '95), Newark, NJ, USA, pp. 587-592, 1995.

@InProceedings{NewlandsWebb95,
Title = {Polygonal Inductive Generalisation System},
Author = {Newlands, D. and Webb, G. I.},
Booktitle = {Proceedings of the Eighth International Conference on Industrial and Engineering Applications of Artificial Intelligence and Expert Systems (IEA/AIE '95)},
Year = {1995},
Editor = {Forsyth, G. and Ali, M.},
Pages = {587--592},
Publisher = {Gordon and Breach Science Publishers, Inc},
Abstract = {Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.},
Audit-trail = {*},
Review-note = {Abstract is word-for-word identical to the abstract of WebbWells95 and does not mention polygonal generalisation or convex hulls; it looks like a copy-paste error. Verify against the paper.},
Keywords = {Convex Hulls},
Location = {Melbourne, Australia}
}
ABSTRACT Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.

Webb, G. I.
Journal of Artificial Intelligence Research, 3, 431-465, 1995.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{Webb95,
Title = {OPUS: An Efficient Admissible Algorithm For Unordered Search},
Author = {Webb, G. I.},
Journal = {Journal of Artificial Intelligence Research},
Year = {1995},
Pages = {431--465},
Volume = {3},
Abstract = {OPUS is a branch and bound search algorithm that enables efficient admissible search through spaces for which the order of search operator application is not significant. The algorithm's search efficiency is demonstrated with respect to very large machine learning search spaces. The use of admissible search is of potential value to the machine learning community as it means that the exact learning biases to be employed for complex learning tasks can be precisely specified and manipulated. OPUS also has potential for application in other areas of artificial intelligence, notably, truth maintenance.},
Audit-trail = {Link to paper via JAIR website},
Doi = {10.1613/jair.227},
Keywords = {Search and Rule Learning and OPUS and Association Rule Discovery},
Publisher = {AAAI Press},
Related = {opus-search}
}
ABSTRACT OPUS is a branch and bound search algorithm that enables efficient admissible search through spaces for which the order of search operator application is not significant. The algorithm's search efficiency is demonstrated with respect to very large machine learning search spaces. The use of admissible search is of potential value to the machine learning community as it means that the exact learning biases to be employed for complex learning tasks can be precisely specified and manipulated. OPUS also has potential for application in other areas of artificial intelligence, notably, truth maintenance.

Smith, P. A., & Webb, G. I.
Proceedings of the Second International Workshop on Automated and Algorithmic Debugging (AADEBUG'95), 1995.

@InProceedings{SmithWebb95a,
Title = {Transparency Debugging with Explanations for Novice Programmers},
Author = {Smith, P. A. and Webb, G. I.},
Booktitle = {Proceedings of the Second International Workshop on Automated and Algorithmic Debugging (AADEBUG'95)},
Year = {1995},
Editor = {Ducass{\'e}, M.},
Publisher = {IRISA-CNRS},
Abstract = {Novice programmers often find programming to be a difficult and frustrating task. Because of their lack of experience in programming novices have different needs to experts when it comes to debugging assistants. One way a debugging assistant could be tailored to novices, as proposed by Eisenstadt, is to provide them with an explicit model of how their program works and, hence encourage them to find errors for themselves. We discuss such a transparency debugger, Bradman, that we have been developing to assist novice programmers understand and debug their C programs. We also present the results of an experiment, conducted on volunteer novice programmers, in which approximately half of the students had access to an explanation of each statement as it was executed and the other half did not. We show that access to such explanations provided beneficial results for a significant number of students.},
Keywords = {Program Visualisation},
Location = {Saint-Malo, France},
Related = {program-visualisation}
}
ABSTRACT Novice programmers often find programming to be a difficult and frustrating task. Because of their lack of experience in programming novices have different needs to experts when it comes to debugging assistants. One way a debugging assistant could be tailored to novices, as proposed by Eisenstadt, is to provide them with an explicit model of how their program works and, hence encourage them to find errors for themselves. We discuss such a transparency debugger, Bradman, that we have been developing to assist novice programmers understand and debug their C programs. We also present the results of an experiment, conducted on volunteer novice programmers, in which approximately half of the students had access to an explanation of each statement as it was executed and the other half did not. We show that access to such explanations provided beneficial results for a significant number of students.

Smith, P. A., & Webb, G. I.
Proceedings of the Seventh Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '95), Melbourne, 1995.

@InProceedings{SmithWebb95b,
author = {Smith, P. A. and Webb, G. I.},
booktitle = {Proceedings of the Seventh Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '95)},
title = {Reinforcing a Generic Computer Model for Novice Programmers},
year = {1995},
publisher = {ASCILITE},
abstract = {Novices often find learning their first programming language to be a frustrating and difficult process. They have difficulties in developing and debugging their programs. One of their problems is that their mental model of how the computer works is inadequate. In this paper we discuss a programming assistant, called Bradman, which we are currently developing. It is aimed at novice programmers and designed to reinforce a concrete mental model of how the computer works as a program is executed. It shows explicitly how program states change as statements in the procedural language C are executed. It does this by means of graphical display together with contextualised verbal explanations of each statement.},
keywords = {Program Visualisation},
location = {Melbourne, Australia},
related = {program-visualisation},
}
ABSTRACT Novices often find learning their first programming language to be a frustrating and difficult process. They have difficulties in developing and debugging their programs. One of their problems is that their mental model of how the computer works is inadequate. In this paper we discuss a programming assistant, called Bradman, which we are currently developing. It is aimed at novice programmers and designed to reinforce a concrete mental model of how the computer works as a program is executed. It shows explicitly how program states change as statements in the procedural language C are executed. It does this by means of graphical display together with contextualised verbal explanations of each statement.

Webb, G. I., & Wells, J.
Proceedings of the Eighth Australian Joint Conference on Artificial Intelligence (AI'95), Singapore, pp. 291-298, 1995.

@InProceedings{WebbWells95,
Title = {Recent Progress in Machine-Expert Collaboration for Knowledge Acquisition},
Author = {Webb, G. I. and Wells, J.},
Booktitle = {Proceedings of the Eighth Australian Joint Conference on Artificial Intelligence (AI'95)},
Year = {1995},
Editor = {Yao, X.},
Pages = {291--298},
Publisher = {World Scientific},
Abstract = {Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.},
Audit-trail = {*},
Keywords = {Machine Learning with Knowledge Acquisition from Experts},
Location = {Canberra, Australia},
Related = {interactive-machine-learning}
}
ABSTRACT Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.

Webb, G. I.
Proceedings of the Second Singapore International Conference on Intelligent Systems (SPICIS-94), Singapore, pp. 280-285, 1994.

@InProceedings{Webb94a,
author = {Webb, G. I.},
booktitle = {Proceedings of the Second Singapore International Conference on Intelligent Systems (SPICIS-94)},
title = {Recent Progress in Learning Decision Lists by Prepending Inferred Rules},
year = {1994},
pages = {280--285},
publisher = {{Asia} Computer Weekly},
volume = {B},
abstract = {This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. By contrast, the classic algorithm operates by appending successive rules to the end of the decision list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy in less time than the classic algorithm.},
audit-trail = {*},
keywords = {Rule Learning and Prepend},
location = {Singapore},
related = {prepending},
}
ABSTRACT This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. By contrast, the classic algorithm operates by appending successive rules to the end of the decision list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy in less time than the classic algorithm.

Webb, G. I.
Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94), Singapore, pp. 60-67, 1994.

@InProceedings{Webb94b,
Title = {Generality Is More Significant Than Complexity: Toward An Alternative To {Occam's} Razor},
Author = {Webb, G. I.},
Booktitle = {Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94)},
Year = {1994},
Editor = {Zhang, C. and Debenham, J. and Lukose, D.},
Pages = {60--67},
Publisher = {World Scientific},
Abstract = {Occam's Razor is widely employed in machine learning to select between classifiers with equal empirical support. This paper presents the theorem of decreasing inductive power: that, all other things being equal, if two classifiers a and b cover identical cases from the training set and a is a generalisation of b, a has higher probability than b of misclassifying a previously unsighted case. This theorem suggests that, to the contrary of Occam's Razor, generality, not complexity, should be used to select between classifiers with equal empirical support. Two studies are presented. The first study demonstrates that the theorem of decreasing inductive power holds for a number of commonly studied learning problems and for a number of different means of manipulating classifier generality. The second study demonstrates that generality provides a more consistent indicator of predictive accuracy in the context of a default rule than does complexity. These results suggest that the theorem of decreasing predictive power provides a suitable theoretical framework for the development of learning biases for use in selecting between classifiers with identical empirical support.},
Audit-trail = {*},
Keywords = {Occams Razor and Rule Learning and Generality},
Location = {Armidale, NSW, Australia},
Related = {occams-razor-in-machine-learning}
}
ABSTRACT Occam's Razor is widely employed in machine learning to select between classifiers with equal empirical support. This paper presents the theorem of decreasing inductive power: that, all other things being equal, if two classifiers a and b cover identical cases from the training set and a is a generalisation of b, a has higher probability than b of misclassifying a previously unsighted case. This theorem suggests that, to the contrary of Occam's Razor, generality, not complexity, should be used to select between classifiers with equal empirical support. Two studies are presented. The first study demonstrates that the theorem of decreasing inductive power holds for a number of commonly studied learning problems and for a number of different means of manipulating classifier generality. The second study demonstrates that generality provides a more consistent indicator of predictive accuracy in the context of a default rule than does complexity. These results suggest that the theorem of decreasing predictive power provides a suitable theoretical framework for the development of learning biases for use in selecting between classifiers with identical empirical support.

Yip, S., & Webb, G. I.
Proceedings of the Tenth Biennial Canadian Artificial Intelligence Conference (AI-94), San Francisco, pp. 63-70, 1994.

@InProceedings{YipWebb94b,
Title = {Incorporating Canonical Discriminant Attributes in Classification Learning},
Author = {Yip, S. and Webb, G. I.},
Booktitle = {Proceedings of the Tenth Biennial Canadian Artificial Intelligence Conference (AI-94)},
Year = {1994},
Editor = {Elio, R.},
Pages = {63--70},
Publisher = {Morgan Kaufmann},
Abstract = {This paper describes a method for incorporating canonical discriminant attributes in classification machine learning. Though decision trees and rules have semantic appeal when building expert systems, the merits of discriminant analysis are well documented. For data sets on which discriminant analysis obtains significantly better predictive accuracy than symbolic machine learning, the incorporation of canonical discriminant attributes can benefit machine learning. The process starts by applying canonical discriminant analysis to the training set. The canonical discriminant attributes are included as additional attributes. The expanded data set is then subjected to machine learning. This enables linear combinations of numeric attributes to be incorporated in the classifiers that are learnt. Evaluation on the data sets on which discriminant analysis performs better than most machine learning systems, such as the Iris flowers and Waveform data sets, shows that incorporating the power of discriminant analysis in machine classification learning can significantly improve the predictive accuracy and reduce the complexity of classifiers induced by machine learning systems.},
Keywords = {Constructive Induction},
Related = {feature-construction}
}
ABSTRACT This paper describes a method for incorporating canonical discriminant attributes in classification machine learning. Though decision trees and rules have semantic appeal when building expert systems, the merits of discriminant analysis are well documented. For data sets on which discriminant analysis obtains significantly better predictive accuracy than symbolic machine learning, the incorporation of canonical discriminant attributes can benefit machine learning. The process starts by applying canonical discriminant analysis to the training set. The canonical discriminant attributes are included as additional attributes. The expanded data set is then subjected to machine learning. This enables linear combinations of numeric attributes to be incorporated in the classifiers that are learnt. Evaluation on the data sets on which discriminant analysis performs better than most machine learning systems, such as the Iris flowers and Waveform data sets, shows that incorporating the power of discriminant analysis in machine classification learning can significantly improve the predictive accuracy and reduce the complexity of classifiers induced by machine learning systems.

Yip, S., & Webb, G. I.
Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94), Singapore, pp. 29-36, 1994.

@InProceedings{YipWebb94a,
author = {Yip, S. and Webb, G. I.},
booktitle = {Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94)},
title = {Empirical Function Attribute Construction in Classification Learning},
year = {1994},
editor = {Zhang, C. and Debenham, J. and Lukose, D.},
pages = {29--36},
publisher = {World Scientific},
abstract = {The merits of incorporating feature construction to assist selective induction in learning hard concepts are well documented. This paper introduces the notion of function attributes and reports a method of incorporating functional regularities in classifiers. Training sets are preprocessed with this method before submission to a selective induction classification learning system. The method, referred to as FAFA (function attribute finding), is characterised by finding bivariate functions that contribute to the discrimination between classes and then transforming them to function attributes as additional attributes of the data set. The value of each function attribute equals the deviation of each example from the value obtained by applying that function to the example. The expanded data set is then submitted to classification learning. Evaluation with published and artificial data shows that this method can improve classifiers in terms of predictive accuracy and complexity.},
keywords = {Constructive Induction},
location = {Armidale, NSW, Australia},
related = {feature-construction},
}
ABSTRACT The merits of incorporating feature construction to assist selective induction in learning hard concepts are well documented. This paper introduces the notion of function attributes and reports a method of incorporating functional regularities in classifiers. Training sets are preprocessed with this method before submission to a selective induction classification learning system. The method, referred to as FAFA (function attribute finding), is characterised by finding bivariate functions that contribute to the discrimination between classes and then transforming them to function attributes as additional attributes of the data set. The value of each function attribute equals the deviation of each example from the value obtained by applying that function to the example. The expanded data set is then submitted to classification learning. Evaluation with published and artificial data shows that this method can improve classifiers in terms of predictive accuracy and complexity.

Webb, G. I.
Proceedings of the 1993 World Conference on Artificial Intelligence in Education (AI-ED'93), Charlottesville, VA, pp. 497-504, 1993.

@InProceedings{Webb93c,
author = {Webb, G. I.},
booktitle = {Proceedings of the 1993 World Conference on Artificial Intelligence in Education (AI-ED'93)},
title = {Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents Competency},
year = {1993},
editor = {Brna, P. and Ohlsson, S. and Pain, H.},
pages = {497--504},
publisher = {AACE},
abstract = {Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90\% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.},
audit-trail = {*},
keywords = {Feature Based Modeling and Case Based Learning},
location = {Edinburgh, Scotland},
note = {Also published in User Modeling and User-Adapted Interaction. 5: 117--150, 1996},
related = {feature-based-modeling},
}
ABSTRACT Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.

Webb, G. I.
Proceedings (Complement) of the Seventh European Workshop on Knowledge Acquisition for Knowledge-based Systems (EWKA'93), pp. 263-275, 1993.

@InProceedings{Webb93e,
author = {Webb, G. I.},
booktitle = {Proceedings (Complement) of the Seventh European Workshop on Knowledge Acquisition for Knowledge-based Systems (EWKA'93)},
title = {Control, Capabilities and Communication: Three Key Issues for Machine-Expert Collaborative Knowledge Acquisition},
year = {1993},
editor = {Aussenac, N. and Boy, G. and Gaines, B. and Linster, M. and Ganascia, J. G. and Kodratoff, Y.},
pages = {263--275},
publisher = {Springer-Verlag},
abstract = {Machine learning and knowledge elicitation are different but complementary approaches to knowledge acquisition. On the face of it there are large potential gains to be reaped from the integration of these two knowledge acquisition techniques. Machine-expert collaborative knowledge acquisition combines these approaches by placing the machine learning system and the human expert as partners in the knowledge-acquisition task. This paper examines three key issues facing machine-expert collaborative knowledge-acquisition: where should control reside, what capabilities should each partner bring to the task and how should the partners communicate?},
audit-trail = {*},
keywords = {Machine Learning with Knowledge Acquisition from Experts},
location = {Toulouse, France},
related = {interactive-machine-learning},
}
ABSTRACT Machine learning and knowledge elicitation are different but complementary approaches to knowledge acquisition. On the face of it there are large potential gains to be reaped from the integration of these two knowledge acquisition techniques. Machine-expert collaborative knowledge acquisition combines these approaches by placing the machine learning system and the human expert as partners in the knowledge-acquisition task. This paper examines three key issues facing machine-expert collaborative knowledge-acquisition: where should control reside, what capabilities should each partner bring to the task and how should the partners communicate?

Webb, G. I.
Proceedings of the Sixth Australian Joint Conference on Artificial Intelligence (AI'93), Singapore, pp. 342-347, 1993.

@InProceedings{Webb93a,
author = {Webb, G. I.},
booktitle = {Proceedings of the Sixth Australian Joint Conference on Artificial Intelligence (AI'93)},
title = {Systematic Search for Categorical Attribute-Value Data-Driven Machine Learning},
year = {1993},
editor = {Rowles, C. and Liu, H. and Foo, N.},
pages = {342--347},
publisher = {World Scientific},
abstract = {Optimal Pruning for Unordered Search is a search algorithm that enables complete search through the space of possible disjuncts at the inner level of a covering algorithm. This algorithm takes as inputs an evaluation function, e, a training set, t, and a set of specialisation operators, o. It outputs a set of operators from o that creates a classifier that maximises e with respect to t. While OPUS has exponential worst case time complexity, the algorithm is demonstrated to reach solutions for complex real world domains within reasonable time frames. Indeed, for some domains, the algorithm exhibits greater computational efficiency than common heuristic search algorithms.},
audit-trail = {*},
keywords = {Search and Rule Learning and OPUS},
location = {Melbourne, Australia},
related = {opus-search},
}
ABSTRACT Optimal Pruning for Unordered Search is a search algorithm that enables complete search through the space of possible disjuncts at the inner level of a covering algorithm. This algorithm takes as inputs an evaluation function, e, a training set, t, and a set of specialisation operators, o. It outputs a set of operators from o that creates a classifier that maximises e with respect to t. While OPUS has exponential worst case time complexity, the algorithm is demonstrated to reach solutions for complex real world domains within reasonable time frames. Indeed, for some domains, the algorithm exhibits greater computational efficiency than common heuristic search algorithms.

Webb, G. I., & Brkic, N.
Proceedings of the AI 93 Workshop on Machine Learning and Hybrid Systems, pp. 6-10, 1993.

@InProceedings{WebbBrkic93,
author = {Webb, G. I. and Brkic, N.},
booktitle = {Proceedings of the AI 93 Workshop on Machine Learning and Hybrid Systems},
title = {Learning Decision Lists by Prepending Inferred Rules},
year = {1993},
editor = {Sestito, S.},
pages = {6--10},
abstract = {This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to front of the list under construction. This contrasts with the original decision list induction algorithm which operates by appending successive rules to end of the list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy than those produced by the original decision list induction algorithm.},
audit-trail = {*},
keywords = {Prepend and Rule Learning},
location = {Melbourne, Australia},
related = {prepending},
}
ABSTRACT This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to front of the list under construction. This contrasts with the original decision list induction algorithm which operates by appending successive rules to end of the list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy than those produced by the original decision list induction algorithm.

Webb, G. I.
Proceedings of the 1993 IJCAI Workshop W16: Machine Learning and Knowledge Acquisition, pp. 236-252, 1993.

@InProceedings{Webb93d,
author = {Webb, G. I.},
booktitle = {Proceedings of the 1993 {IJCAI} Workshop W16: Machine Learning and Knowledge Acquisition},
title = {{DLGref2}: Techniques for Inductive Rule Refinement},
year = {1993},
pages = {236--252},
abstract = {This paper describes and evaluates machine learning techniques for knowledge-base refinement. These techniques are central to Einstein, a knowledge acquisition system that enables a human expert to collaborate with a machine learning system at all stages of the knowledge-acquisition cycle. Experimental evaluation demonstrates that the knowledge-base refinement techniques are able to significantly increase the accuracy of nontrivial expert systems in a wide variety of domains.},
audit-trail = {Reconstructed paper posted May 2005},
keywords = {Rule Learning},
location = {Chambery, France},
}
ABSTRACT This paper describes and evaluates machine learning techniques for knowledge-base refinement. These techniques are central to Einstein, a knowledge acquisition system that enables a human expert to collaborate with a machine learning system at all stages of the knowledge-acquisition cycle. Experimental evaluation demonstrates that the knowledge-base refinement techniques are able to significantly increase the accuracy of nontrivial expert systems in a wide variety of domains.

Kuzmycz, M., & Webb, G. I.
Lecture Notes in Computer Science Vol. 608: Proceedings of the Second International Conference on Intelligent Tutoring Systems (ITS'92), Berlin, pp. 269-276, 1992.

@InProceedings{KuzmyczWebb92,
author = {Kuzmycz, M. and Webb, G. I.},
booktitle = {Lecture Notes in Computer Science Vol. 608: Proceedings of the Second International Conference on Intelligent Tutoring Systems (ITS'92)},
title = {Evaluation of Feature Based Modelling in Subtraction},
year = {1992},
editor = {Frasson, C. and Gauthier, G. and McCalla, G. I.},
pages = {269--276},
publisher = {Springer-Verlag},
abstract = {One aim of intelligent tutoring systems is to tailor lessons to each individual student's needs. To do this a tutoring system requires a model of the student's knowledge. Cognitive modelling aims to produce a detailed explanation of the student's progress. Feature Based Modelling forms a cognitive model of the student by creating aspects of problem descriptions and of students' responses. This paper will discuss Feature Based Modelling and show the results of an evaluation carried out in the domain of elemental subtraction.},
audit-trail = {*},
keywords = {Feature Based Modeling and Case Based Learning},
related = {feature-based-modeling},
}
ABSTRACT One aim of intelligent tutoring systems is to tailor lessons to each individual student's needs. To do this a tutoring system requires a model of the student's knowledge. Cognitive modelling aims to produce a detailed explanation of the student's progress. Feature Based Modelling forms a cognitive model of the student by creating aspects of problem descriptions and of students' responses. This paper will discuss Feature Based Modelling and show the results of an evaluation carried out in the domain of elemental subtraction.

Webb, G. I.
Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92), Singapore, pp. 329-334, 1992.

@InProceedings{Webb92,
author = {Webb, G. I.},
booktitle = {Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92)},
title = {Man-Machine Collaboration for Knowledge Acquisition},
year = {1992},
editor = {Adams, A. and Sterling, L.},
pages = {329--334},
publisher = {World Scientific},
abstract = {Both machine learning and knowledge elicitation from human experts have unique strengths and weaknesses. Man-machine collaboration for knowledge acquisition allows both knowledge acquisition techniques to be employed hand-in-hand. The strengths of each can alleviate the other's weaknesses. This has the potential to both reduce the time taken to develop an expert system while increasing the quality of the finished product. This paper discusses techniques for man-machine collaboration for knowledge acquisition and describes Einstein, a computer system that implements those techniques.},
audit-trail = {*},
keywords = {Machine Learning with Knowledge Acquisition from Experts},
location = {Hobart, Tas., Australia},
related = {interactive-machine-learning},
}
ABSTRACT Both machine learning and knowledge elicitation from human experts have unique strengths and weaknesses. Man-machine collaboration for knowledge acquisition allows both knowledge acquisition techniques to be employed hand-in-hand. The strengths of each can alleviate the other's weaknesses. This has the potential to both reduce the time taken to develop an expert system while increasing the quality of the finished product. This paper discusses techniques for man-machine collaboration for knowledge acquisition and describes Einstein, a computer system that implements those techniques.

Smith, P. A., & Webb, G. I.
A Future Promised: Proceedings of the Fifth Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '92), pp. 351-356, 1992.

@InProceedings{SmithWebb92,
author = {Smith, P. A. and Webb, G. I.},
booktitle = {A Future Promised: Proceedings of the Fifth Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '92)},
title = {Recent Progress in the Development of a Debugging Assistant for Computer Programs},
year = {1992},
editor = {Chia, B. and Pennell, R. and Sims, R.},
pages = {351--356},
abstract = {We present recent progress in the development of a debugging assistant for helping novices debug their computer programs. Bradman, which is still in the implementation phase, is an interactive system which builds two models of the user's program - one reflecting what the program actually does and the other reflecting what the programmer intended to do. Conflicts between these two models are used by Bradman to find bugs in the program.},
keywords = {Program Visualisation},
location = {Sydney, Australia},
related = {program-visualisation},
}
ABSTRACT We present recent progress in the development of a debugging assistant for helping novices debug their computer programs. Bradman, which is still in the implementation phase, is an interactive system which builds two models of the user's program - one reflecting what the program actually does and the other reflecting what the programmer intended to do. Conflicts between these two models are used by Bradman to find bugs in the program.

Agar, J., & Webb, G. I.
Nephrology, Dialysis and Transplantation, 7, 472-478, 1992.
[Bibtex] [Abstract]  → Access on publisher site  → Related papers and software

@Article{AgarWebb92,
author = {Agar, J. and Webb, G. I.},
journal = {Nephrology, Dialysis and Transplantation},
title = {Application Of Machine Learning To A Renal Biopsy Data-Base},
year = {1992},
pages = {472--478},
volume = {7},
abstract = {This pilot study has applied machine learning (artificial intelligence derived qualitative analysis procedures) to yield non-invasive techniques for the assessment and interpretation of clinical and laboratory data in glomerular disease. To evaluate the appropriateness of these techniques, they were applied to subsets of a small database of 284 case histories and the resulting procedures evaluated against the remaining cases. Over such evaluations, the following average diagnostic accuracies were obtained: microscopic polyarteritis, 95.37\%; minimal lesion nephrotic syndrome, 96.50\%; immunoglobulin A nephropathy, 81.26\%; minor changes, 93.66\%; lupus nephritis, 96.27\%; focal glomerulosclerosis, 92.06\%; mesangial proliferative glomerulonephritis, 92.56\%; and membranous nephropathy, 92.56\%. Although in general the new diagnostic system is not yet as accurate as the histological evaluation of renal biopsy specimens, it shows promise of adding a further dimension to the diagnostic process. When the machine learning techniques are applied to a larger database, greater diagnostic accuracy should be obtained. It may allow accurate non-invasive diagnosis of some cases of glomerular disease without the need for renal biopsy. This may reduce both the cost and the morbidity of the investigation of glomerular disease and may be of particular value in situations where renal biopsy is considered hazardous or contraindicated.},
audit-trail = {28/10/03 Link to abstract only at this stage available via Oxford Press.},
keywords = {Rule Learning, health},
publisher = {Oxford University Press},
related = {health},
url = {http://ndt.oxfordjournals.org/content/7/6/472.abstract},
}
ABSTRACT This pilot study has applied machine learning (artificial intelligence derived qualitative analysis procedures) to yield non-invasive techniques for the assessment and interpretation of clinical and laboratory data in glomerular disease. To evaluate the appropriateness of these techniques, they were applied to subsets of a small database of 284 case histories and the resulting procedures evaluated against the remaining cases. Over such evaluations, the following average diagnostic accuracies were obtained: microscopic polyarteritis, 95.37%; minimal lesion nephrotic syndrome, 96.50%; immunoglobulin A nephropathy, 81.26%; minor changes, 93.66%; lupus nephritis, 96.27%; focal glomerulosclerosis, 92.06%; mesangial proliferative glomerulonephritis, 92.56%; and membranous nephropathy, 92.56%. Although in general the new diagnostic system is not yet as accurate as the histological evaluation of renal biopsy specimens, it shows promise of adding a further dimension to the diagnostic process. When the machine learning techniques are applied to a larger database, greater diagnostic accuracy should be obtained. It may allow accurate non-invasive diagnosis of some cases of glomerular disease without the need for renal biopsy. This may reduce both the cost and the morbidity of the investigation of glomerular disease and may be of particular value in situations where renal biopsy is considered hazardous or contraindicated.

Yip, S., & Webb, G. I.
Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92), Singapore, pp. 374-379, 1992.

@InProceedings{YipWebb92b,
Title = {Discriminate Attribute Finding in Classification Learning},
Author = {Yip, S. and Webb, G. I.},
Booktitle = {Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92)},
Year = {1992},
Re