Geoff Webb Publications


Tan, C. W., Webb, G. I., Petitjean, F., & Reichl, P. (in press). Tamping Effectiveness Prediction Using Supervised Machine Learning Techniques. Proceedings of the First International Conference on Rail Transportation (ICRT).
[Bibtex]

@InProceedings{TanEtAl17b,
Title = {Tamping Effectiveness Prediction Using Supervised Machine Learning Techniques},
Author = {Tan, C. W. and Webb, G. I. and Petitjean, F. and Reichl, P.},
Booktitle = {Proceedings of the First International Conference on Rail Transportation (ICRT)},
Year = {in press},
Owner = {giwebb},
Timestamp = {2017.05.10}
}

Zaidi, N., Webb, G. I., Carman, M., Petitjean, F., Buntine, W., Hynes, H., & De Sterck, H. (in press). Efficient Parameter Learning of Bayesian Network Classifiers. Machine Learning.
[PDF] [DOI] [Bibtex]  → Related papers and software

@Article{ZaidiEtAl17,
Title = {Efficient Parameter Learning of Bayesian Network Classifiers},
Author = {Zaidi, N. and Webb, Geoffrey I and Carman, M. and Petitjean, F. and Buntine, W. and Hynes, H. and De Sterck, H.},
Journal = {Machine Learning},
Year = {in press},
Doi = {10.1007/s10994-016-5619-z},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning},
Url = {http://rdcu.be/oP1t}
}

Song, J., Li, C., Zheng, C., Revote, J., Zhang, Z., & Webb, G. I. (in press). MetalExplorer, a Bioinformatics Tool for the Improved Prediction of Eight Types of Metal-binding Sites Using a Random Forest Algorithm with Two-step Feature Selection. Current Bioinformatics, 11.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl16,
Title = {MetalExplorer, a Bioinformatics Tool for the Improved Prediction of Eight Types of Metal-binding Sites Using a Random Forest Algorithm with Two-step Feature Selection},
Author = {Jiangning Song and Chen Li and Cheng Zheng and Jerico Revote and Ziding Zhang and Geoffrey I. Webb},
Journal = {Current Bioinformatics},
Year = {in press},
Volume = {11},
Abstract = {Metalloproteins are highly involved in many biological processes,
including catalysis, recognition, transport, transcription, and signal
transduction. The metal ions they bind usually play enzymatic or structural
roles in mediating these diverse functional roles. Thus, the systematic
analysis and prediction of metal-binding sites using sequence and/or
structural information are crucial for understanding their
sequence-structure-function relationships. In this study, we propose
MetalExplorer (http://metalexplorer.erc.monash.edu.au/), a new machine
learning-based method for predicting eight different types of metal-binding
sites (Ca, Co, Cu, Fe, Ni, Mg, Mn, and Zn) in proteins. Our approach
combines heterogeneous sequence-, structure-, and residue contact
network-based features. The predictive performance of MetalExplorer was
tested by cross-validation and independent tests using non-redundant
datasets of known structures. This method applies a two-step feature
selection approach based on the maximum relevance minimum redundancy and
forward feature selection to identify the most informative features that
contribute to the prediction performance. With a precision of 60%,
MetalExplorer achieved high recall values, which ranged from 59% to 88% for
the eight metal ion types in fivefold cross-validation tests. Moreover, the
common and type-specific features in the optimal subsets of all metal ions
were characterized in terms of their contributions to the overall
performance. In terms of both benchmark and independent datasets at the 60%
precision control level, MetalExplorer compared favorably with an existing
metalloprotein prediction tool, SitePredict. Thus, MetalExplorer is expected
to be a powerful tool for the accurate prediction of potential metal-binding
sites and it should facilitate the functional analysis and rational design
of novel metalloproteins.},
Doi = {10.2174/2468422806666160618091522},
ISSN = {1574-8936/2212-392X},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT Metalloproteins are highly involved in many biological processes, including catalysis, recognition, transport, transcription, and signal transduction. The metal ions they bind usually play enzymatic or structural roles in mediating these diverse functional roles. Thus, the systematic analysis and prediction of metal-binding sites using sequence and/or structural information are crucial for understanding their sequence-structure-function relationships. In this study, we propose MetalExplorer (http://metalexplorer.erc.monash.edu.au/), a new machine learning-based method for predicting eight different types of metal-binding sites (Ca, Co, Cu, Fe, Ni, Mg, Mn, and Zn) in proteins. Our approach combines heterogeneous sequence-, structure-, and residue contact network-based features. The predictive performance of MetalExplorer was tested by cross-validation and independent tests using non-redundant datasets of known structures. This method applies a two-step feature selection approach based on the maximum relevance minimum redundancy and forward feature selection to identify the most informative features that contribute to the prediction performance. With a precision of 60%, MetalExplorer achieved high recall values, which ranged from 59% to 88% for the eight metal ion types in fivefold cross-validation tests. Moreover, the common and type-specific features in the optimal subsets of all metal ions were characterized in terms of their contributions to the overall performance. In terms of both benchmark and independent datasets at the 60% precision control level, MetalExplorer compared favorably with an existing metalloprotein prediction tool, SitePredict. Thus, MetalExplorer is expected to be a powerful tool for the accurate prediction of potential metal-binding sites and it should facilitate the functional analysis and rational design of novel metalloproteins.
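As background for the two-step feature selection described above, the sketch below illustrates the general idea (a maximum-relevance-minimum-redundancy ranking followed by greedy forward selection). It is an illustration only, not the MetalExplorer implementation; the feature matrix X, the binary labels y, and the use of mutual information and correlation as relevance/redundancy proxies are all assumptions.

# Illustrative two-step feature selection: an mRMR-style ranking followed by
# greedy forward selection with cross-validation. Not the MetalExplorer code;
# X (n_samples x n_features) and binary labels y are assumed to exist.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score

def mrmr_rank(X, y, k):
    relevance = mutual_info_classif(X, y, random_state=0)
    selected, remaining = [], list(range(X.shape[1]))
    while remaining and len(selected) < k:
        def mrmr_score(j):
            if not selected:
                return relevance[j]
            redundancy = np.mean([abs(np.corrcoef(X[:, j], X[:, s])[0, 1])
                                  for s in selected])
            return relevance[j] - redundancy
        best = max(remaining, key=mrmr_score)
        selected.append(best)
        remaining.remove(best)
    return selected

def forward_select(X, y, candidates):
    chosen, best_score = [], -np.inf
    for j in candidates:  # greedy single pass over the mRMR ranking
        trial = chosen + [j]
        score = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=0),
                                X[:, trial], y, cv=5).mean()
        if score > best_score:
            chosen, best_score = trial, score  # keep the feature only if CV improves
    return chosen, best_score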

An, Y., Wang, J., Li, C., Leier, A., Marquez-Lago, T., Wilksch, J., Zhang, Y., Webb, G. I., Song, J., & Lithgow, T. (in press). Comprehensive assessment and performance improvement of effector protein predictors for bacterial secretion systems III, IV and VI. Briefings in Bioinformatics.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{AnEtAl2016,
Title = {Comprehensive assessment and performance improvement of effector protein predictors for bacterial secretion systems III, IV and VI},
Author = {An, Yi and Wang, Jiawei and Li, Chen and Leier, André and Marquez-Lago, Tatiana and Wilksch, Jonathan and Zhang, Yang and Webb, Geoffrey I. and Song, Jiangning and Lithgow, Trevor},
Journal = {Briefings in Bioinformatics},
Year = {in press},
Abstract = {Bacterial effector proteins secreted by various protein secretion systems play crucial roles in host–pathogen interactions. In this context, computational tools capable of accurately predicting effector proteins of the various types of bacterial secretion systems are highly desirable. Existing computational approaches use different machine learning (ML) techniques and heterogeneous features derived from protein sequences and/or structural information. These predictors differ not only in terms of the used ML methods but also with respect to the used curated data sets, the features selection and their prediction performance. Here, we provide a comprehensive survey and benchmarking of currently available tools for the prediction of effector proteins of bacterial types III, IV and VI secretion systems (T3SS, T4SS and T6SS, respectively). We review core algorithms, feature selection techniques, tool availability and applicability and evaluate the prediction performance based on carefully curated independent test data sets. In an effort to improve predictive performance, we constructed three ensemble models based on ML algorithms by integrating the output of all individual predictors reviewed. Our benchmarks demonstrate that these ensemble models outperform all the reviewed tools for the prediction of effector proteins of T3SS and T4SS. The webserver of the proposed ensemble methods for T3SS and T4SS effector protein prediction is freely available at http://tbooster.erc.monash.edu/index.jsp. We anticipate that this survey will serve as a useful guide for interested users and that the new ensemble predictors will stimulate research into host–pathogen relationships and inspiration for the development of new bioinformatics tools for predicting effector proteins of T3SS, T4SS and T6SS.},
Doi = {10.1093/bib/bbw100},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT Bacterial effector proteins secreted by various protein secretion systems play crucial roles in host–pathogen interactions. In this context, computational tools capable of accurately predicting effector proteins of the various types of bacterial secretion systems are highly desirable. Existing computational approaches use different machine learning (ML) techniques and heterogeneous features derived from protein sequences and/or structural information. These predictors differ not only in terms of the used ML methods but also with respect to the used curated data sets, the features selection and their prediction performance. Here, we provide a comprehensive survey and benchmarking of currently available tools for the prediction of effector proteins of bacterial types III, IV and VI secretion systems (T3SS, T4SS and T6SS, respectively). We review core algorithms, feature selection techniques, tool availability and applicability and evaluate the prediction performance based on carefully curated independent test data sets. In an effort to improve predictive performance, we constructed three ensemble models based on ML algorithms by integrating the output of all individual predictors reviewed. Our benchmarks demonstrate that these ensemble models outperform all the reviewed tools for the prediction of effector proteins of T3SS and T4SS. The webserver of the proposed ensemble methods for T3SS and T4SS effector protein prediction is freely available at http://tbooster.erc.monash.edu/index.jsp. We anticipate that this survey will serve as a useful guide for interested users and that the new ensemble predictors will stimulate research into host–pathogen relationships and inspiration for the development of new bioinformatics tools for predicting effector proteins of T3SS, T4SS and T6SS.
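The ensemble models described above integrate the outputs of the individual predictors with a machine-learning meta-model, which is essentially stacking. A generic sketch is given below, under the assumption that a score matrix with one column per reviewed tool is already available; the actual tools, features, and ensemble algorithms used in the paper are not reproduced here.

# Generic stacking sketch: a meta-classifier trained on per-tool prediction
# scores. Illustrative only; `tool_scores` (one column per predictor) and the
# binary effector labels are assumed inputs, not outputs of the published tools.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

def fit_stacked_ensemble(tool_scores, labels):
    meta = LogisticRegression(max_iter=1000)
    meta.fit(tool_scores, labels)
    return meta

def out_of_fold_scores(tool_scores, labels):
    # Out-of-fold probabilities give an honest view of ensemble performance.
    return cross_val_predict(LogisticRegression(max_iter=1000), tool_scores,
                             labels, cv=5, method="predict_proba")[:, 1]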

Zaidi, N. A., & Webb, G. I. (2017). A Fast Trust-Region Newton Method for Softmax Logistic Regression. Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 705-713.
[PDF] [Bibtex]

@InProceedings{ZaidiWebb17,
Title = {A Fast Trust-Region Newton Method for Softmax Logistic Regression},
Author = {Zaidi, Nayyar A and Webb, Geoffrey I},
Booktitle = {Proceedings of the 2017 SIAM International Conference on Data Mining},
Year = {2017},
Organization = {SIAM},
Pages = {705-713}
}

Sammut, C., & Webb, G. I. (Eds.). (2017). Encyclopedia of Machine Learning and Data Mining. Berlin: Springer.
[DOI] [Bibtex]

@Book{SammutWebb17,
Title = {Encyclopedia of Machine Learning and Data Mining},
Editor = {Sammut, C. and Webb, G.I.},
Publisher = {Springer},
Year = {2017},
Address = {Berlin},
Doi = {10.1007/978-1-4899-7502-7}
}

Hamalainen, W., & Webb, G. I. (2017). Specious rules: an efficient and effective unifying method for removing misleading and uninformative patterns in association rule mining. Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 309-317.
[PDF] [Bibtex]

@InProceedings{HamalainenWebb17,
Title = {Specious rules: an efficient and effective unifying method for removing misleading and uninformative patterns in association rule mining},
Author = {Hamalainen, Wilhelmiina and Webb, Geoffrey I},
Booktitle = {Proceedings of the 2017 SIAM International Conference on Data Mining},
Year = {2017},
Organization = {SIAM},
Pages = {309-317},
Keywords = {association discovery}
}

Webb, G. I., Lee, L. K., Petitjean, F., & Goethals, B. (2017). Understanding Concept Drift (No. arXiv:1704.00362).
[URL] [Bibtex] [Abstract]

@TechReport{WebbEtAl17,
Title = {Understanding Concept Drift},
Author = {Geoffrey I. Webb and Loong Kuan Lee and Francois Petitjean and Bart Goethals},
Year = {2017},
Number = {arXiv:1704.00362},
Abstract = {Concept drift is a major issue that greatly affects the accuracy and reliability of many real-world applications of machine learning. We argue that to tackle concept drift it is important to develop the capacity to describe and analyze it. We propose tools for this purpose, arguing for the importance of quantitative descriptions of drift in marginal distributions. We present quantitative drift analysis techniques along with methods for communicating their results. We demonstrate their effectiveness by application to three real-world learning tasks.},
Keywords = {Concept Drift},
Owner = {giwebb},
Timestamp = {2017.04.04},
Url = {https://arxiv.org/abs/1704.00362}
}
ABSTRACT Concept drift is a major issue that greatly affects the accuracy and reliability of many real-world applications of machine learning. We argue that to tackle concept drift it is important to develop the capacity to describe and analyze it. We propose tools for this purpose, arguing for the importance of quantitative descriptions of drift in marginal distributions. We present quantitative drift analysis techniques along with methods for communicating their results. We demonstrate their effectiveness by application to three real-world learning tasks.
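To make "quantitative descriptions of drift in marginal distributions" concrete, one simple option is to compare the empirical distribution of an attribute across two time windows with a divergence measure. The sketch below uses total variation distance on discretised values; it illustrates the general idea only, not the specific analysis techniques proposed in the report.

# Illustration only: measuring drift in a single marginal distribution between
# two time windows using total variation distance. Window contents are assumed.
import numpy as np
from collections import Counter

def marginal_drift(window_a, window_b):
    values = set(window_a) | set(window_b)
    pa, pb = Counter(window_a), Counter(window_b)
    na, nb = len(window_a), len(window_b)
    # Total variation distance between the two empirical marginals, in [0, 1].
    return 0.5 * sum(abs(pa[v] / na - pb[v] / nb) for v in values)

# Example: drift between a uniform and a skewed categorical stream.
rng = np.random.default_rng(0)
old = rng.choice(["a", "b", "c"], size=1000, p=[1/3, 1/3, 1/3])
new = rng.choice(["a", "b", "c"], size=1000, p=[0.6, 0.2, 0.2])
print(marginal_drift(old.tolist(), new.tolist()))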

Fernando, T. L., & Webb, G. I. (2017). SimUSF: an efficient and effective similarity measure that is invariant to violations of the interval scale assumption. Data Mining and Knowledge Discovery, 31(1), 264-286.
[PDF] [DOI] [Bibtex] [Abstract]

@Article{FernandoWebb16,
Title = {SimUSF: an efficient and effective similarity measure that is invariant to violations of the interval scale assumption},
Author = {Fernando, Thilak L. and Webb, Geoffrey I.},
Journal = {Data Mining and Knowledge Discovery},
Year = {2017},
Number = {1},
Pages = {264-286},
Volume = {31},
Abstract = {Similarity measures are central to many machine learning algorithms. There are many different similarity measures, each catering for different applications and data requirements. Most similarity measures used with numerical data assume that the attributes are interval scale. In the interval scale, it is assumed that a unit difference has the same meaning irrespective of the magnitudes of the values separated. When this assumption is violated, accuracy may be reduced. Our experiments show that removing the interval scale assumption by transforming data to ranks can improve the accuracy of distance-based similarity measures on some tasks. However the rank transform has high time and storage overheads. In this paper, we introduce an efficient similarity measure which does not consider the magnitudes of inter-instance distances. We compare the new similarity measure with popular similarity measures in two applications: DBScan clustering and content based multimedia information retrieval with real world datasets and different transform functions. The results show that the proposed similarity measure provides good performance on a range of tasks and is invariant to violations of the interval scale assumption.},
Doi = {10.1007/s10618-016-0463-0}
}
ABSTRACT Similarity measures are central to many machine learning algorithms. There are many different similarity measures, each catering for different applications and data requirements. Most similarity measures used with numerical data assume that the attributes are interval scale. In the interval scale, it is assumed that a unit difference has the same meaning irrespective of the magnitudes of the values separated. When this assumption is violated, accuracy may be reduced. Our experiments show that removing the interval scale assumption by transforming data to ranks can improve the accuracy of distance-based similarity measures on some tasks. However the rank transform has high time and storage overheads. In this paper, we introduce an efficient similarity measure which does not consider the magnitudes of inter-instance distances. We compare the new similarity measure with popular similarity measures in two applications: DBScan clustering and content based multimedia information retrieval with real world datasets and different transform functions. The results show that the proposed similarity measure provides good performance on a range of tasks and is invariant to violations of the interval scale assumption.
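The rank transform discussed in the abstract is straightforward to apply ahead of any distance-based similarity measure, and is the baseline whose overheads motivate the paper. A brief sketch of that baseline follows; this is not the SimUSF measure itself, and the numeric feature matrix X is assumed.

# Illustrative only: removing the interval-scale assumption by ranking each
# attribute before computing Euclidean distances. This is the rank-transform
# baseline discussed in the abstract, not the proposed SimUSF measure.
import numpy as np
from scipy.stats import rankdata
from scipy.spatial.distance import pdist, squareform

def rank_transform_distances(X):
    ranked = np.column_stack([rankdata(X[:, j]) for j in range(X.shape[1])])
    return squareform(pdist(ranked, metric="euclidean"))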

Tan, C. W., Webb, G. I., & Petitjean, F. (2017). Indexing and classifying gigabytes of time series under time warping. Proceedings of the 2017 SIAM International Conference on Data Mining, pp. 282-290.
[PDF] [Bibtex]

@InProceedings{TanEtAl17a,
Title = {Indexing and classifying gigabytes of time series under time warping},
Author = {Tan, Chang Wei and Webb, Geoffrey I and Petitjean, Fran{\c{c}}ois},
Booktitle = {Proceedings of the 2017 SIAM International Conference on Data Mining},
Year = {2017},
Organization = {SIAM},
Pages = {282-290},
Keywords = {time series}
}

Chen, S., Martinez, A., Webb, G., & Wang, L. (2017). Sample-based Attribute Selective AnDE for Large Data. IEEE Transactions on Knowledge and Data Engineering, 29(1), 172-185.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ChenEtAl16b,
Title = {Sample-based Attribute Selective AnDE for Large Data},
Author = {S. Chen and A. Martinez and G. Webb and L. Wang},
Journal = {IEEE Transactions on Knowledge and Data Engineering},
Year = {2017},
Number = {1},
Pages = {172 - 185},
Volume = {29},
Abstract = {More and more applications come with large data sets in the past decade. However, existing algorithms cannot guarantee to scale well on large data. Averaged n-Dependence Estimators (AnDE) allows for flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence AnDE is especially appropriate for large data learning. In this paper, we propose a sample-based attribute selection technique for AnDE. It needs one more pass through the training data, in which a multitude of approximate AnDE models are built and efficiently assessed by leave-one-out cross validation. The use of a sample reduces the training time. Experiments on 15 large data sets demonstrate that the proposed technique significantly reduces AnDE’s error at the cost of a modest increase in training time. This efficient and scalable out-of-core approach delivers superior or comparable performance to typical in-core Bayesian network classifiers.},
Doi = {10.1109/TKDE.2016.2608881},
ISSN = {1041-4347},
Keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT More and more applications come with large data sets in the past decade. However, existing algorithms cannot guarantee to scale well on large data. Averaged n-Dependence Estimators (AnDE) allows for flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence AnDE is especially appropriate for large data learning. In this paper, we propose a sample-based attribute selection technique for AnDE. It needs one more pass through the training data, in which a multitude of approximate AnDE models are built and efficiently assessed by leave-one-out cross validation. The use of a sample reduces the training time. Experiments on 15 large data sets demonstrate that the proposed technique significantly reduces AnDE’s error at the cost of a modest increase in training time. This efficient and scalable out-of-core approach delivers superior or comparable performance to typical in-core Bayesian network classifiers.

Chen, S., Martínez, A. M., Webb, G. I., & Wang, L. (2017). Selective AnDE for large data learning: a low-bias memory constrained approach. Knowledge and Information Systems, 50(2), 475-503.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ChenEtAl16,
Title = {Selective AnDE for large data learning: a low-bias memory constrained approach},
Author = {Chen, Shenglei
and Mart{\'i}nez, Ana M.
and Webb, Geoffrey I.
and Wang, Limin},
Journal = {Knowledge and Information Systems},
Year = {2017},
Number = {2},
Pages = {475-503},
Volume = {50},
Abstract = {Learning from data that are too big to fit into memory poses great challenges to currently available learning approaches. Averaged n-Dependence Estimators (AnDE) allows for a flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence, AnDE is especially appropriate for learning from large quantities of data. Memory requirement in AnDE, however, increases combinatorially with the number of attributes and the parameter n. In large data learning, number of attributes is often large and we also expect high n to achieve low-bias classification. In order to achieve the lower bias of AnDE with higher n but with less memory requirement, we propose a memory constrained selective AnDE algorithm, in which two passes of learning through training examples are involved. The first pass performs attribute selection on super parents according to available memory, whereas the second one learns an AnDE model with parents only on the selected attributes. Extensive experiments show that the new selective AnDE has considerably lower bias and prediction error relative to An'DE, where n' = n-1, while maintaining the same space complexity and similar time complexity. The proposed algorithm works well on categorical data. Numerical data sets need to be discretized first.},
Doi = {10.1007/s10115-016-0937-9},
ISSN = {0219-3116},
Keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Learning from data that are too big to fit into memory poses great challenges to currently available learning approaches. Averaged n-Dependence Estimators (AnDE) allows for a flexible learning from out-of-core data, by varying the value of n (number of super parents). Hence, AnDE is especially appropriate for learning from large quantities of data. Memory requirement in AnDE, however, increases combinatorially with the number of attributes and the parameter n. In large data learning, number of attributes is often large and we also expect high n to achieve low-bias classification. In order to achieve the lower bias of AnDE with higher n but with less memory requirement, we propose a memory constrained selective AnDE algorithm, in which two passes of learning through training examples are involved. The first pass performs attribute selection on super parents according to available memory, whereas the second one learns an AnDE model with parents only on the selected attributes. Extensive experiments show that the new selective AnDE has considerably lower bias and prediction error relative to An'DE, where n' = n-1, while maintaining the same space complexity and similar time complexity. The proposed algorithm works well on categorical data. Numerical data sets need to be discretized first.

An, Y., Wang, J., Li, C., Revote, J., Zhang, Y., Naderer, T., Hayashida, M., Akutsu, T., Webb, G. I., Lithgow, T., & Song, J. (2017). SecretEPDB: a comprehensive web-based resource for secreted effector proteins of the bacterial types III, IV and VI secretion systems. Scientific Reports, 7, Art. no. 41031.
[DOI] [Bibtex]  → Related papers and software

@Article{AnEtAl17,
Title = {SecretEPDB: a comprehensive web-based resource for secreted effector proteins of the bacterial types III, IV and VI secretion systems},
Author = {An, Yi and Wang, Jiawei and Li, Chen and Revote, Jerico and Zhang, Yang and Naderer, Thomas and Hayashida, Mirohiro and Akutsu, Tatsuya and Webb, Geoffrey I. and Lithgow, Trevor and Song, Jiangning},
Journal = {Scientific Reports},
Year = {2017},
Volume = {7},
Articlenumber = {41031},
Doi = {10.1038/srep41031},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://rdcu.be/oJ9I}
}

Webb, G. I., Hyde, R., Cao, H., Nguyen, H. L., & Petitjean, F. (2016). Characterizing Concept Drift. Data Mining and Knowledge Discovery, 30(4), 964-994.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbEtAl16,
Title = {Characterizing Concept Drift},
Author = {G.I. Webb and R. Hyde and H. Cao and H.L. Nguyen and F. Petitjean},
Journal = {Data Mining and Knowledge Discovery},
Year = {2016},
Number = {4},
Pages = {964-994},
Volume = {30},
Abstract = {Most machine learning models are static, but the world is dynamic, and increasing online deployment of learned models gives increasing urgency to the development of efficient and effective mechanisms to address learning in the context of non-stationary distributions, or as it is commonly called concept drift. However, the key issue of characterizing the different types of drift that can occur has not previously been subjected to rigorous definition and analysis. In particular, while some qualitative drift categorizations have been proposed, few have been formally defined, and the quantitative descriptions required for precise and objective understanding of learner performance have not existed. We present the first comprehensive framework for quantitative analysis of drift. This supports the development of the first comprehensive set of formal definitions of types of concept drift. The formal definitions clarify ambiguities and identify gaps in previous definitions, giving rise to a new comprehensive taxonomy of concept drift types and a solid foundation for research into mechanisms to detect and address concept drift.},
Doi = {10.1007/s10618-015-0448-4},
Keywords = {Concept Drift},
Related = {learning-from-non-stationary-distributions},
Url = {http://arxiv.org/abs/1511.03816},
Urltext = {Link to prepublication draft}
}
ABSTRACT Most machine learning models are static, but the world is dynamic, and increasing online deployment of learned models gives increasing urgency to the development of efficient and effective mechanisms to address learning in the context of non-stationary distributions, or as it is commonly called concept drift. However, the key issue of characterizing the different types of drift that can occur has not previously been subjected to rigorous definition and analysis. In particular, while some qualitative drift categorizations have been proposed, few have been formally defined, and the quantitative descriptions required for precise and objective understanding of learner performance have not existed. We present the first comprehensive framework for quantitative analysis of drift. This supports the development of the first comprehensive set of formal definitions of types of concept drift. The formal definitions clarify ambiguities and identify gaps in previous definitions, giving rise to a new comprehensive taxonomy of concept drift types and a solid foundation for research into mechanisms to detect and address concept drift.

Petitjean, F., Forestier, G., Webb, G. I., Nicholson, A. E., Chen, Y., & Keogh, E. (2016). Faster and more accurate classification of time series by exploiting a novel dynamic time warping averaging algorithm. Knowledge and Information Systems, 47(1), 1-26.
[PDF] [URL] [Bibtex] [Abstract]

@Article{PetitjeanEtAl16a,
Title = {Faster and more accurate classification of time series by exploiting a novel dynamic time warping averaging algorithm},
Author = {F. Petitjean and G. Forestier and G.I. Webb and A.E. Nicholson and Y. Chen and E. Keogh},
Journal = {Knowledge and Information Systems},
Year = {2016},
Number = {1},
Pages = {1-26},
Volume = {47},
Abstract = {A concerted research effort over the past two decades has heralded significant improvements in both the efficiency and effectiveness of time series classification. The consensus that has emerged in the community is that the best solution is a surprisingly simple one. In virtually all domains, the most accurate classifier is the nearest neighbor algorithm with dynamic time warping as the distance measure. The time complexity of dynamic time warping means that successful deployments on resource-constrained devices remain elusive. Moreover, the recent explosion of interest in wearable computing devices, which typically have limited computational resources, has greatly increased the need for very efficient classification algorithms. A classic technique to obtain the benefits of the nearest neighbor algorithm, without inheriting its undesirable time and space complexity, is to use the nearest centroid algorithm. Unfortunately, the unique properties of (most) time series data mean that the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this paper we demonstrate that we can exploit a recent result by Petitjean et al. to allow meaningful averaging of “warped” time series, which then allows us to create super-efficient nearest “centroid” classifiers that are at least as accurate as their more computationally challenged nearest neighbor relatives. We demonstrate empirically the utility of our approach by comparing it to all the appropriate strawmen algorithms on the ubiquitous UCR Benchmarks and with a case study in supporting insect classification on resource-constrained sensors.},
Keywords = {time series},
Url = {http://link.springer.com/article/10.1007/s10115-015-0878-8}
}
ABSTRACT A concerted research effort over the past two decades has heralded significant improvements in both the efficiency and effectiveness of time series classification. The consensus that has emerged in the community is that the best solution is a surprisingly simple one. In virtually all domains, the most accurate classifier is the nearest neighbor algorithm with dynamic time warping as the distance measure. The time complexity of dynamic time warping means that successful deployments on resource-constrained devices remain elusive. Moreover, the recent explosion of interest in wearable computing devices, which typically have limited computational resources, has greatly increased the need for very efficient classification algorithms. A classic technique to obtain the benefits of the nearest neighbor algorithm, without inheriting its undesirable time and space complexity, is to use the nearest centroid algorithm. Unfortunately, the unique properties of (most) time series data mean that the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this paper we demonstrate that we can exploit a recent result by Petitjean et al. to allow meaningful averaging of “warped” time series, which then allows us to create super-efficient nearest “centroid” classifiers that are at least as accurate as their more computationally challenged nearest neighbor relatives. We demonstrate empirically the utility of our approach by comparing it to all the appropriate strawmen algorithms on the ubiquitous UCR Benchmarks and with a case study in supporting insect classification on resource-constrained sensors.
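For readers unfamiliar with the nearest-centroid idea used here, the sketch below contrasts it with 1-NN under DTW. It uses a naive quadratic-time DTW and a plain arithmetic mean as a stand-in centroid; the paper's contribution is precisely a principled averaging in DTW space (DBA), which this toy code does not implement.

# Toy sketch: nearest-centroid classification of equal-length series under DTW.
# The arithmetic mean is a placeholder centroid; the paper uses DTW barycenter
# averaging instead, which is not reproduced here.
import numpy as np

def dtw(a, b):
    n, m = len(a), len(b)
    cost = np.full((n + 1, m + 1), np.inf)
    cost[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            d = (a[i - 1] - b[j - 1]) ** 2
            cost[i, j] = d + min(cost[i - 1, j], cost[i, j - 1], cost[i - 1, j - 1])
    return np.sqrt(cost[n, m])

def nearest_centroid_predict(train_X, train_y, query):
    # One centroid per class; classification needs only one DTW call per class.
    centroids = {c: train_X[train_y == c].mean(axis=0) for c in np.unique(train_y)}
    return min(centroids, key=lambda c: dtw(query, centroids[c]))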

Martinez, A. M., Webb, G. I., Chen, S., & Zaidi, N. A. (2016). Scalable Learning of Bayesian Network Classifiers. Journal of Machine Learning Research, 17(44), 1-35.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{MartinezEtAl16,
Title = {Scalable Learning of {Bayesian} Network Classifiers},
Author = {Ana M. Martinez and Geoffrey I. Webb and Shenglei Chen and Nayyar A. Zaidi},
Journal = {Journal of Machine Learning Research},
Year = {2016},
Number = {44},
Pages = {1-35},
Volume = {17},
Abstract = {Ever increasing data quantity makes ever more urgent the need for highly scalable learners that have good classification performance. Therefore, an out-of-core learner with excellent time and space complexity, along with high expressivity (that is, capacity to learn very complex multivariate probability distributions) is extremely desirable. This paper presents such a learner. We propose an extension to the k-dependence Bayesian classifier (KDB) that discriminatively selects a sub-model of a full KDB classifier. It requires only one additional pass through the training data, making it a three-pass learner. Our extensive experimental evaluation on 16 large data sets reveals that this out-of-core algorithm achieves competitive classification performance, and substantially better training and classification time than state-of-the-art in-core learners such as random forest and linear and non-linear logistic regression.},
Keywords = {Conditional Probability Estimation and AODE and Learning from large datasets and DP140100087},
Related = {learning-complex-conditional-probabilities-from-data},
Url = {http://jmlr.org/papers/v17/martinez16a.html}
}
ABSTRACT Ever increasing data quantity makes ever more urgent the need for highly scalable learners that have good classification performance. Therefore, an out-of-core learner with excellent time and space complexity, along with high expressivity (that is, capacity to learn very complex multivariate probability distributions) is extremely desirable. This paper presents such a learner. We propose an extension to the k-dependence Bayesian classifier (KDB) that discriminatively selects a sub-model of a full KDB classifier. It requires only one additional pass through the training data, making it a three-pass learner. Our extensive experimental evaluation on 16 large data sets reveals that this out-of-core algorithm achieves competitive classification performance, and substantially better training and classification time than state-of-the-art in-core learners such as random forest and linear and non-linear logistic regression.

Wang, H., Feng, L., Zhang, Z., Webb, G. I., Lin, D., & Song, J. (2016). Crysalis: an integrated server for computational analysis and design of protein crystallization. Scientific Reports, 6, Art. no. 21383.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WangEtAl16,
Title = {Crysalis: an integrated server for computational analysis and design of protein crystallization},
Author = {Wang, H. and Feng, L. and Zhang, Z. and Webb, G. I. and Lin, D. and Song, J.},
Journal = {Scientific Reports},
Year = {2016},
Volume = {6},
Abstract = {The failure of multi-step experimental procedures to yield diffraction-quality crystals is a major bottleneck in protein structure determination. Accordingly, several bioinformatics methods have been successfully developed and employed to select crystallizable proteins. Unfortunately, the majority of existing in silico methods only allow the prediction of crystallization propensity, seldom enabling computational design of protein mutants that can be targeted for enhancing protein crystallizability. Here, we present Crysalis, an integrated crystallization analysis tool that builds on support-vector regression (SVR) models to facilitate computational protein crystallization prediction, analysis, and design. More specifically, the functionality of this new tool includes: (1) rapid selection of target crystallizable proteins at the proteome level, (2) identification of site non-optimality for protein crystallization and systematic analysis of all potential single-point mutations that might enhance protein crystallization propensity, and (3) annotation of target protein based on predicted structural properties. We applied the design mode of Crysalis to identify site non-optimality for protein crystallization on a proteome-scale, focusing on proteins currently classified as non-crystallizable. Our results revealed that site non-optimality is based on biases related to residues, predicted structures, physicochemical properties, and sequence loci, which provides in-depth understanding of the features influencing protein crystallization. Crysalis is freely available at http://nmrcen.xmu.edu.cn/crysalis/.},
Articlenumber = {21383},
Doi = {10.1038/srep21383},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT The failure of multi-step experimental procedures to yield diffraction-quality crystals is a major bottleneck in protein structure determination. Accordingly, several bioinformatics methods have been successfully developed and employed to select crystallizable proteins. Unfortunately, the majority of existing in silico methods only allow the prediction of crystallization propensity, seldom enabling computational design of protein mutants that can be targeted for enhancing protein crystallizability. Here, we present Crysalis, an integrated crystallization analysis tool that builds on support-vector regression (SVR) models to facilitate computational protein crystallization prediction, analysis, and design. More specifically, the functionality of this new tool includes: (1) rapid selection of target crystallizable proteins at the proteome level, (2) identification of site non-optimality for protein crystallization and systematic analysis of all potential single-point mutations that might enhance protein crystallization propensity, and (3) annotation of target protein based on predicted structural properties. We applied the design mode of Crysalis to identify site non-optimality for protein crystallization on a proteome-scale, focusing on proteins currently classified as non-crystallizable. Our results revealed that site non-optimality is based on biases related to residues, predicted structures, physicochemical properties, and sequence loci, which provides in-depth understanding of the features influencing protein crystallization. Crysalis is freely available at http://nmrcen.xmu.edu.cn/crysalis/.

Webb, G. I., & Petitjean, F. (2016). A multiple test correction for streams and cascades of statistical hypothesis tests. Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16, pp. 1255-1264.
Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbPetitjean16,
Title = {A multiple test correction for streams and cascades of statistical hypothesis tests},
Author = {Webb, Geoffrey I and Petitjean, Francois},
Booktitle = {Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16},
Year = {2016},
Pages = {1255-1264},
Publisher = {ACM Press},
Abstract = {Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e.~rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance.
This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed.
To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models.
We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.},
Comment = {Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue},
Doi = {10.1145/2939672.2939775},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
Related = {statistically-sound-association-discovery},
Url = {http://dl.acm.org/authorize?N19100}
}
ABSTRACT Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e.~rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance. This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed. To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models. We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.
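For context on familywise error control, the standard Holm-Bonferroni adjustment over a fixed, pre-determined family of p-values looks like the sketch below. The paper's Subfamilywise Multiple Testing addresses the harder setting in which hypotheses arrive in pools during a search and their number is not known in advance; that method is not reproduced here.

# Standard Holm-Bonferroni FWER control over a pre-determined family of p-values,
# shown only as the conventional baseline. Subfamilywise Multiple Testing handles
# streams and cascades of tests and is not implemented in this sketch.
def holm_reject(p_values, alpha=0.05):
    order = sorted(range(len(p_values)), key=lambda i: p_values[i])
    rejected = [False] * len(p_values)
    m = len(p_values)
    for rank, i in enumerate(order):
        if p_values[i] <= alpha / (m - rank):
            rejected[i] = True
        else:
            break  # once one test fails, all larger p-values are retained
    return rejected

print(holm_reject([0.001, 0.02, 0.04, 0.30]))  # -> [True, False, False, False]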

Petitjean, F., & Webb, G. I. (2016). Scalable Learning of Graphical Models. Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16, pp. 2131-2132.
[URL] [Bibtex]  → Related papers and software

@InProceedings{PetitjeanWebbTut16,
Title = {Scalable Learning of Graphical Models},
Author = {F. Petitjean and G.I. Webb},
Booktitle = {Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16},
Year = {2016},
Pages = {2131-2132},
Publisher = {ACM Press},
Keywords = {scalable graphical models and Learning from large datasets and DP140100087},
Related = {scalable-graphical-modeling},
Url = {http://dl.acm.org/authorize?N19101}
}

Porebski, B. T., Keleher, S., Hollins, J. J., Nickson, A. A., Marijanovic, E. M., Borg, N. A., Costa, M. G. S., Pearce, M. A., Dai, W., Zhu, L., Irving, J. A., Hoke, D. E., Kass, I., Whisstock, J. C., Bottomley, S. P., Webb, G. I., McGowan, S., & Buckle, A. M. (2016). Smoothing a rugged protein folding landscape by sequence-based redesign. Scientific Reports, 6, Art. no. 33958.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Porebski2016,
Title = {Smoothing a rugged protein folding landscape by sequence-based redesign},
Author = {Porebski, Benjamin T. and Keleher, Shani and Hollins, Jeffrey J. and Nickson, Adrian A. and Marijanovic, Emilia M. and Borg, Natalie A. and Costa, Mauricio G. S. and Pearce, Mary A. and Dai, Weiwen and Zhu, Liguang and Irving, James A. and Hoke, David E. and Kass, Itamar and Whisstock, James C. and Bottomley, Stephen P. and Webb, Geoffrey I. and McGowan, Sheena and Buckle, Ashley M.},
Journal = {Scientific Reports},
Year = {2016},
Volume = {6},
Abstract = {The rugged folding landscapes of functional proteins puts them at risk of misfolding and aggregation. Serine protease inhibitors, or serpins, are paradigms for this delicate balance between function and misfolding. Serpins exist in a metastable state that undergoes a major conformational change in order to inhibit proteases. However, conformational labiality of the native serpin fold renders them susceptible to misfolding, which underlies misfolding diseases such as alpha1-antitrypsin deficiency. To investigate how serpins balance function and folding, we used consensus design to create conserpin, a synthetic serpin that folds reversibly, is functional, thermostable, and polymerization resistant. Characterization of its structure, folding and dynamics suggest that consensus design has remodeled the folding landscape to reconcile competing requirements for stability and function. This approach may offer general benefits for engineering functional proteins that have risky folding landscapes, including the removal of aggregation-prone intermediates, and modifying scaffolds for use as protein therapeutics.},
Articlenumber = {33958},
Doi = {10.1038/srep33958},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1038/srep33958}
}
ABSTRACT The rugged folding landscapes of functional proteins puts them at risk of misfolding and aggregation. Serine protease inhibitors, or serpins, are paradigms for this delicate balance between function and misfolding. Serpins exist in a metastable state that undergoes a major conformational change in order to inhibit proteases. However, conformational labiality of the native serpin fold renders them susceptible to misfolding, which underlies misfolding diseases such as alpha1-antitrypsin deficiency. To investigate how serpins balance function and folding, we used consensus design to create conserpin, a synthetic serpin that folds reversibly, is functional, thermostable, and polymerization resistant. Characterization of its structure, folding and dynamics suggest that consensus design has remodeled the folding landscape to reconcile competing requirements for stability and function. This approach may offer general benefits for engineering functional proteins that have risky folding landscapes, including the removal of aggregation-prone intermediates, and modifying scaffolds for use as protein therapeutics.

Chang, C. C. H., Li, C., Webb, G. I., Tey, B., & Song, J. (2016). Periscope: quantitative prediction of soluble protein expression in the periplasm of Escherichia coli. Scientific Reports, 6, Art. no. 21844.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{ChangEtAl2016,
Title = {Periscope: quantitative prediction of soluble protein expression in the periplasm of Escherichia coli},
Author = {C.C.H. Chang and C. Li and G. I. Webb and B. Tey and J. Song},
Journal = {Scientific Reports},
Year = {2016},
Volume = {6},
Abstract = {Periplasmic expression of soluble proteins in Escherichia coli not only offers a much-simplified downstream purification process, but also enhances the probability of obtaining correctly folded and biologically active proteins. Different combinations of signal peptides and target proteins lead to different soluble protein expression levels, ranging from negligible to several grams per litre. Accurate algorithms for rational selection of promising candidates can serve as a powerful tool to complement with current trial-and-error approaches. Accordingly, proteomics studies can be conducted with greater efficiency and cost-effectiveness. Here, we developed a predictor with a two-stage architecture, to predict the real-valued expression level of target protein in the periplasm. The output of the first-stage support vector machine (SVM) classifier determines which second-stage support vector regression (SVR) classifier to be used. When tested on an independent test dataset, the predictor achieved an overall prediction accuracy of 78% and a Pearson’s correlation coefficient (PCC) of 0.77. We further illustrate the relative importance of various features with respect to different models. The results indicate that the occurrence of dipeptide glutamine and aspartic acid is the most important feature for the classification model. Finally, we provide access to the implemented predictor through the Periscope webserver, freely accessible at http://lightning.med.monash.edu/periscope/.},
Articlenumber = {21844},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1038/srep21844}
}
ABSTRACT Periplasmic expression of soluble proteins in Escherichia coli not only offers a much-simplified downstream purification process, but also enhances the probability of obtaining correctly folded and biologically active proteins. Different combinations of signal peptides and target proteins lead to different soluble protein expression levels, ranging from negligible to several grams per litre. Accurate algorithms for rational selection of promising candidates can serve as a powerful tool to complement with current trial-and-error approaches. Accordingly, proteomics studies can be conducted with greater efficiency and cost-effectiveness. Here, we developed a predictor with a two-stage architecture, to predict the real-valued expression level of target protein in the periplasm. The output of the first-stage support vector machine (SVM) classifier determines which second-stage support vector regression (SVR) classifier to be used. When tested on an independent test dataset, the predictor achieved an overall prediction accuracy of 78% and a Pearson’s correlation coefficient (PCC) of 0.77. We further illustrate the relative importance of various features with respect to different models. The results indicate that the occurrence of dipeptide glutamine and aspartic acid is the most important feature for the classification model. Finally, we provide access to the implemented predictor through the Periscope webserver, freely accessible at http://lightning.med.monash.edu/periscope/.
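The two-stage architecture described above (a support vector classifier that routes each protein to one of several support vector regressors) can be sketched generically as below. This is a schematic under assumed inputs, not the Periscope feature set, thresholds, kernels, or trained models.

# Schematic of a two-stage predictor: a classifier picks which regressor to use.
# Not the Periscope implementation; features X, real-valued expression levels y,
# the routing threshold and the kernels are all assumptions for illustration.
import numpy as np
from sklearn.svm import SVC, SVR

class TwoStagePredictor:
    def __init__(self, level_threshold):
        self.gate = SVC(kernel="rbf")           # stage 1: low vs high expression regime
        self.regressors = {0: SVR(), 1: SVR()}  # stage 2: one SVR per regime
        self.threshold = level_threshold

    def fit(self, X, y):
        regime = (y >= self.threshold).astype(int)
        self.gate.fit(X, regime)
        for r, model in self.regressors.items():
            mask = regime == r
            if mask.any():
                model.fit(X[mask], y[mask])
        return self

    def predict(self, X):
        regime = self.gate.predict(X)
        return np.array([self.regressors[r].predict(x.reshape(1, -1))[0]
                         for r, x in zip(regime, X)])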

Petitjean, F., Li, T., Tatti, N., & Webb, G. I. (2016). Skopus: Mining top-k sequential patterns under leverage. Data Mining and Knowledge Discovery, 30(5), 1086-1111.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{PetitjeanEtAl16b,
Title = {Skopus: Mining top-k sequential patterns under leverage},
Author = {Petitjean, Francois
and Li, Tao
and Tatti, Nikolaj
and Webb, Geoffrey I.},
Journal = {Data Mining and Knowledge Discovery},
Year = {2016},
Number = {5},
Pages = {1086-1111},
Volume = {30},
Abstract = {This paper presents a framework for exact discovery of the top-k sequential patterns under Leverage. It combines (1) a novel definition of the expected support for a sequential pattern---a concept on which most interestingness measures directly rely---with (2) Skopus: a new branch-and-bound algorithm for the exact discovery of top-k sequential patterns under a given measure of interest. Our interestingness measure employs the partition approach. A pattern is interesting to the extent that it is more frequent than can be explained by assuming independence between any of the pairs of patterns from which it can be composed. The larger the support compared to the expectation under independence, the more interesting is the pattern. We build on these two elements to exactly extract the k sequential patterns with highest leverage, consistent with our definition of expected support. We conduct experiments on both synthetic data with known patterns and real-world datasets; both experiments confirm the consistency and relevance of our approach with regard to the state of the art.},
Doi = {10.1007/s10618-016-0467-9},
ISSN = {1573-756X},
Keywords = {OPUS and Association Rule Discovery and statistically sound discovery},
Related = {statistically-sound-association-discovery}
}
ABSTRACT This paper presents a framework for exact discovery of the top-k sequential patterns under Leverage. It combines (1) a novel definition of the expected support for a sequential pattern---a concept on which most interestingness measures directly rely---with (2) Skopus: a new branch-and-bound algorithm for the exact discovery of top-k sequential patterns under a given measure of interest. Our interestingness measure employs the partition approach. A pattern is interesting to the extent that it is more frequent than can be explained by assuming independence between any of the pairs of patterns from which it can be composed. The larger the support compared to the expectation under independence, the more interesting is the pattern. We build on these two elements to exactly extract the k sequential patterns with highest leverage, consistent with our definition of expected support. We conduct experiments on both synthetic data with known patterns and real-world datasets; both experiments confirm the consistency and relevance of our approach with regard to the state of the art.
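The leverage measure underpinning Skopus compares a pattern's observed support with its expected support under an independence assumption. The toy calculation below illustrates leverage for simple item pairs only; the paper's contribution is the more involved definition of expected support for sequential patterns and the branch-and-bound search, neither of which is reproduced here.

# Toy illustration of leverage = support(pattern) - expected support under
# independence, for item pairs. The sequential-pattern case in Skopus requires
# a different expectation model and exact top-k search, not implemented here.
def leverage(transactions, a, b):
    n = len(transactions)
    sup_ab = sum(1 for t in transactions if a in t and b in t) / n
    sup_a = sum(1 for t in transactions if a in t) / n
    sup_b = sum(1 for t in transactions if b in t) / n
    return sup_ab - sup_a * sup_b

data = [{"x", "y"}, {"x", "y"}, {"x", "y"}, {"z"}, {"w"}]
# Positive leverage: x and y co-occur more often than independence would predict.
print(leverage(data, "x", "y"))  # 0.6 - 0.6*0.6 = 0.24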

Zaidi, N. A., Webb, G. I., Carman, M. J., Petitjean, F., & Cerquides, J. (2016). ALRn: Accelerated higher-order logistic regression. Machine Learning, 104(2), 151-194.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ZaidiEtAl16b,
Title = {{ALRn}: Accelerated higher-order logistic regression},
Author = {Zaidi, Nayyar A.
and Webb, Geoffrey I.
and Carman, Mark J.
and Petitjean, Fran{\c{c}}ois
and Cerquides, Jes{\'u}s},
Journal = {Machine Learning},
Year = {2016},
Number = {2},
Pages = {151-194},
Volume = {104},
Abstract = {This paper introduces Accelerated Logistic Regression: a hybrid generative-discriminative approach to training Logistic Regression with high-order features. We present two main results: (1) that our combined generative-discriminative approach significantly improves the efficiency of Logistic Regression and (2) that incorporating higher order features (i.e. features that are the Cartesian products of the original features) reduces the bias of Logistic Regression, which in turn significantly reduces its error on large datasets. We assess the efficacy of Accelerated Logistic Regression by conducting an extensive set of experiments on 75 standard datasets. We demonstrate its competitiveness, particularly on large datasets, by comparing against state-of-the-art classifiers including Random Forest and Averaged n-Dependence Estimators.},
Doi = {10.1007/s10994-016-5574-8},
ISSN = {1573-0565},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning},
Url = {http://dx.doi.org/10.1007/s10994-016-5574-8}
}
ABSTRACT This paper introduces Accelerated Logistic Regression: a hybrid generative-discriminative approach to training Logistic Regression with high-order features. We present two main results: (1) that our combined generative-discriminative approach significantly improves the efficiency of Logistic Regression and (2) that incorporating higher order features (i.e. features that are the Cartesian products of the original features) reduces the bias of Logistic Regression, which in turn significantly reduces its error on large datasets. We assess the efficacy of Accelerated Logistic Regression by conducting an extensive set of experiments on 75 standard datasets. We demonstrate its competitiveness, particularly on large datasets, by comparing against state-of-the-art classifiers including Random Forest and Averaged n-Dependence Estimators.
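The "higher order features" referred to above are Cartesian products of the original (categorical) attributes. The sketch below shows that feature-expansion step with standard scikit-learn pieces; it is not the ALRn algorithm or its generative-discriminative acceleration, and the data frame of categorical attributes is assumed.

# Sketch of second-order feature expansion for logistic regression: every pair
# of original categorical attributes is crossed into a new joint attribute.
# Illustrates the feature space only, not ALRn's accelerated training.
from itertools import combinations
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

def add_pairwise_products(df):
    out = df.copy()
    for a, b in combinations(df.columns, 2):
        out[f"{a}*{b}"] = df[a].astype(str) + "_" + df[b].astype(str)
    return out

def fit_higher_order_lr(df, y):
    expanded = add_pairwise_products(df)
    enc = OneHotEncoder(handle_unknown="ignore")
    X = enc.fit_transform(expanded)
    return enc, LogisticRegression(max_iter=1000).fit(X, y)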

Li, F., Li, C., Revote, J., Zhang, Y., Webb, G. I., Li, J., Song, J., & Lithgow, T. (2016). GlycoMinestruct: a new bioinformatics tool for highly accurate mapping of the human N-linked and O-linked glycoproteomes by incorporating structural features. Scientific Reports, 6, Art. no. 34595.
[DOI] [Bibtex]  → Related papers and software

@Article{LiEtAl16,
Title = {GlycoMinestruct: a new bioinformatics tool for highly accurate mapping of the human N-linked and O-linked glycoproteomes by incorporating structural features},
Author = {Li, Fuyi and Li, Chen and Revote, Jerico and Zhang, Yang and Webb, Geoffrey I. and Li, Jian and Song, Jiangning and Lithgow, Trevor},
Journal = {Scientific Reports},
Year = {2016},
Month = oct,
Volume = {6},
Articlenumber = {34595},
Doi = {10.1038/srep34595},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology}
}
ABSTRACT 

Zaidi, N. A., Petitjean, F., & Webb, G. I. (2016). Preconditioning an Artificial Neural Network Using Naive Bayes. Proceedings of the 20th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining, PAKDD 2016, pp. 341-353.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZaidiEtAl16,
Title = {Preconditioning an Artificial Neural Network Using Naive {Bayes}},
Author = {Zaidi, Nayyar A.
and Petitjean, Fran{\c{c}}ois
and Webb, Geoffrey I.},
Booktitle = {Proceedings of the 20th {Pacific-Asia} Conference on Advances in Knowledge Discovery and Data Mining, {PAKDD} 2016},
Year = {2016},
Editor = {Bailey, James
and Khan, Latifur
and Washio, Takashi
and Dobbie, Gill
and Huang, Zhexue Joshua
and Wang, Ruili},
Pages = {341-353},
Publisher = {Springer International Publishing},
Abstract = {Logistic Regression (LR) is a workhorse of the statistics community and a state-of-the-art machine learning classifier. It learns a linear model from inputs to outputs trained by optimizing the Conditional Log-Likelihood (CLL) of the data. Recently, it has been shown that preconditioning LR using a Naive Bayes (NB) model speeds up LR learning many-fold. One can, however, train a linear model by optimizing the mean-square-error (MSE) instead of CLL. This leads to an Artificial Neural Network (ANN) with no hidden layer. In this work, we study the effect of NB preconditioning on such an ANN classifier. Optimizing MSE instead of CLL may lead to a lower bias classifier and hence result in better performance on big datasets. We show that this NB preconditioning can speed-up convergence significantly. We also show that optimizing a linear model with MSE leads to a lower bias classifier than optimizing with CLL. We also compare the performance to state-of-the-art classifier Random Forest.},
Doi = {10.1007/978-3-319-31753-3_28},
ISBN = {978-3-319-31753-3},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning},
Url = {http://dx.doi.org/10.1007/978-3-319-31753-3_28}
}
ABSTRACT Logistic Regression (LR) is a workhorse of the statistics community and a state-of-the-art machine learning classifier. It learns a linear model from inputs to outputs trained by optimizing the Conditional Log-Likelihood (CLL) of the data. Recently, it has been shown that preconditioning LR using a Naive Bayes (NB) model speeds up LR learning many-fold. One can, however, train a linear model by optimizing the mean-square-error (MSE) instead of CLL. This leads to an Artificial Neural Network (ANN) with no hidden layer. In this work, we study the effect of NB preconditioning on such an ANN classifier. Optimizing MSE instead of CLL may lead to a lower bias classifier and hence result in better performance on big datasets. We show that this NB preconditioning can speed-up convergence significantly. We also show that optimizing a linear model with MSE leads to a lower bias classifier than optimizing with CLL. We also compare the performance to state-of-the-art classifier Random Forest.
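
The contrast the abstract draws between optimizing conditional log-likelihood and mean squared error for the same linear (no-hidden-layer) model can be sketched as follows. The naive Bayes preconditioning itself is not shown, the data are synthetic, and the training loop is a bare illustration rather than the authors' implementation.

# Minimal contrast between the two objectives: the same linear model with a
# sigmoid output trained by gradient descent on CLL versus MSE.
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 3))
y = (X @ np.array([1.5, -2.0, 0.5]) + rng.normal(scale=0.5, size=200) > 0).astype(float)

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def train(X, y, loss="cll", lr=0.1, epochs=500):
    w = np.zeros(X.shape[1])
    for _ in range(epochs):
        p = sigmoid(X @ w)
        if loss == "cll":          # gradient of the negative conditional log-likelihood
            grad = X.T @ (p - y) / len(y)
        else:                      # gradient of the mean squared error
            grad = X.T @ ((p - y) * p * (1 - p)) / len(y)
        w -= lr * grad
    return w

for loss in ("cll", "mse"):
    w = train(X, y, loss)
    acc = ((sigmoid(X @ w) > 0.5) == y).mean()
    print(loss, "training accuracy:", round(acc, 3))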

Zhang, A., Shi, W., & Webb, G. I. (2016). Mining significant association rules from uncertain data. Data Mining and Knowledge Discovery, 30(4), 928-963.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ZhangEtAl16,
Title = {Mining significant association rules from uncertain data},
Author = {Zhang, Anshu and Shi, Wenzhong and Webb, Geoffrey I},
Journal = {Data Mining and Knowledge Discovery},
Year = {2016},
Number = {4},
Pages = {928-963},
Volume = {30},
Abstract = {In association rule mining, the trade-off between avoiding harmful spurious rules and preserving authentic ones is an ever critical barrier to obtaining reliable and useful results. The statistically sound technique for evaluating statistical significance of association rules is superior in preventing spurious rules, yet can also cause severe loss of true rules in presence of data error. This study presents a new and improved method for statistical test on association rules with uncertain erroneous data. An original mathematical model was established to describe data error propagation through computational procedures of the statistical test. Based on the error model, a scheme combining analytic and simulative processes was designed to correct the statistical test for distortions caused by data error. Experiments on both synthetic and real-world data show that the method significantly recovers the loss in true rules (reduces type-2 error) due to data error occurring in original statistically sound method. Meanwhile, the new method maintains effective control over the familywise error rate, which is the distinctive advantage of the original statistically sound technique. Furthermore, the method is robust against inaccurate data error probability information and situations not fulfilling the commonly accepted assumption on independent error probabilities of different data items. The method is particularly effective for rules which were most practically meaningful yet sensitive to data error. The method proves promising in enhancing values of association rule mining results and helping users make correct decisions.},
Doi = {10.1007/s10618-015-0446-6},
Keywords = {Association Rule Discovery and statistically sound discovery},
Publisher = {Springer},
Related = {statistically-sound-association-discovery}
}
ABSTRACT In association rule mining, the trade-off between avoiding harmful spurious rules and preserving authentic ones is an ever critical barrier to obtaining reliable and useful results. The statistically sound technique for evaluating statistical significance of association rules is superior in preventing spurious rules, yet can also cause severe loss of true rules in presence of data error. This study presents a new and improved method for statistical test on association rules with uncertain erroneous data. An original mathematical model was established to describe data error propagation through computational procedures of the statistical test. Based on the error model, a scheme combining analytic and simulative processes was designed to correct the statistical test for distortions caused by data error. Experiments on both synthetic and real-world data show that the method significantly recovers the loss in true rules (reduces type-2 error) due to data error occurring in original statistically sound method. Meanwhile, the new method maintains effective control over the familywise error rate, which is the distinctive advantage of the original statistically sound technique. Furthermore, the method is robust against inaccurate data error probability information and situations not fulfilling the commonly accepted assumption on independent error probabilities of different data items. The method is particularly effective for rules which were most practically meaningful yet sensitive to data error. The method proves promising in enhancing values of association rule mining results and helping users make correct decisions.

Petitjean, F., & Webb, G. I. (2015). Scaling log-linear analysis to datasets with thousands of variables. Proceedings of the 2015 SIAM International Conference on Data Mining, pp. 469-477.
Best Research Paper Honorable Mention Award
[URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{PetitjeanWebb15,
Title = {Scaling log-linear analysis to datasets with thousands of variables},
Author = {F. Petitjean and G.I. Webb},
Booktitle = {Proceedings of the 2015 {SIAM} International Conference on Data Mining},
Year = {2015},
Pages = {469-477},
Abstract = {Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.},
Comment = {Best Research Paper Honorable Mention Award},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
Related = {scalable-graphical-modeling},
Url = {http://epubs.siam.org/doi/pdf/10.1137/1.9781611974010.53}
}
ABSTRACT Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.

Porebski, B. T., Nickson, A. A., Hoke, D. E., Hunter, M. R., Zhu, L., McGowan, S., Webb, G. I., & Buckle, A. M. (2015). Structural and dynamic properties that govern the stability of an engineered fibronectin type III domain. Protein Engineering, Design and Selection, 28(3), 67-78.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{PorebskiEtAl15,
Title = {Structural and dynamic properties that govern the stability of an engineered fibronectin type III domain},
Author = {B. T. Porebski and A. A. Nickson and D. E. Hoke and M. R. Hunter and L. Zhu and S. McGowan and G. I. Webb and A. M. Buckle},
Journal = {Protein Engineering, Design and Selection},
Year = {2015},
Number = {3},
Pages = {67-78},
Volume = {28},
Abstract = {Consensus protein design is a rapid and reliable technique for the improvement of protein stability, which relies on the use of homologous protein sequences. To enhance the stability of a fibronectin type III (FN3) domain, consensus design was employed using an alignment of 2123 sequences. The resulting FN3 domain, FN3con, has unprecedented stability, with a melting temperature >100°C, a ΔGD-N of 15.5 kcal mol-1 and a greatly reduced unfolding rate compared with wild-type. To determine the underlying molecular basis for stability, an X-ray crystal structure of FN3con was determined to 2.0 Å and compared with other FN3 domains of varying stabilities. The structure of FN3con reveals significantly increased salt bridge interactions that are cooperatively networked, and a highly optimized hydrophobic core. Molecular dynamics simulations of FN3con and comparison structures show the cooperative power of electrostatic and hydrophobic networks in improving FN3con stability. Taken together, our data reveal that FN3con stability does not result from a single mechanism, but rather the combination of several features and the removal of non-conserved, unfavorable interactions. The large number of sequences employed in this study has most likely enhanced the robustness of the consensus design, which is now possible due to the increased sequence availability in the post-genomic era. These studies increase our knowledge of the molecular mechanisms that govern stability and demonstrate the rising potential for enhancing stability via the consensus method.},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://peds.oxfordjournals.org/content/28/3/67.full.pdf+html}
}
ABSTRACT Consensus protein design is a rapid and reliable technique for the improvement of protein stability, which relies on the use of homologous protein sequences. To enhance the stability of a fibronectin type III (FN3) domain, consensus design was employed using an alignment of 2123 sequences. The resulting FN3 domain, FN3con, has unprecedented stability, with a melting temperature >100°C, a ΔGD-N of 15.5 kcal mol-1 and a greatly reduced unfolding rate compared with wild-type. To determine the underlying molecular basis for stability, an X-ray crystal structure of FN3con was determined to 2.0 Å and compared with other FN3 domains of varying stabilities. The structure of FN3con reveals significantly increased salt bridge interactions that are cooperatively networked, and a highly optimized hydrophobic core. Molecular dynamics simulations of FN3con and comparison structures show the cooperative power of electrostatic and hydrophobic networks in improving FN3con stability. Taken together, our data reveal that FN3con stability does not result from a single mechanism, but rather the combination of several features and the removal of non-conserved, unfavorable interactions. The large number of sequences employed in this study has most likely enhanced the robustness of the consensus design, which is now possible due to the increased sequence availability in the post-genomic era. These studies increase our knowledge of the molecular mechanisms that govern stability and demonstrate the rising potential for enhancing stability via the consensus method.

Li, F., Li, C., Wang, M., Webb, G. I., Zhang, Y., Whisstock, J. C., & Song, J. (2015). GlycoMine: a machine learning-based approach for predicting N-, C- and O-linked glycosylation in the human proteome. Bioinformatics, 31(9), 1411-1419.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{LiEtAl15,
Title = {GlycoMine: a machine learning-based approach for predicting N-, C- and O-linked glycosylation in the human proteome},
Author = {F. Li and C. Li and M. Wang and G. I. Webb and Y. Zhang and J. C. Whisstock and J. Song},
Journal = {Bioinformatics},
Year = {2015},
Number = {9},
Pages = {1411-1419},
Volume = {31},
Abstract = {Motivation: Glycosylation is a ubiquitous type of protein post-translational modification (PTM) in eukaryotic cells, which plays vital roles in various biological processes (BPs) such as cellular communication, ligand recognition and subcellular recognition. It is estimated that >50% of the entire human proteome is glycosylated. However, it is still a significant challenge to identify glycosylation sites, which requires expensive/laborious experimental research. Thus, bioinformatics approaches that can predict the glycan occupancy at specific sequons in protein sequences would be useful for understanding and utilizing this important PTM.
Results: In this study, we present a novel bioinformatics tool called GlycoMine, which is a comprehensive tool for the systematic in silico identification of C-linked, N-linked, and O-linked glycosylation sites in the human proteome. GlycoMine was developed using the random forest algorithm and evaluated based on a well-prepared up-to-date benchmark dataset that encompasses all three types of glycosylation sites, which was curated from multiple public resources. Heterogeneous sequences and functional features were derived from various sources, and subjected to further two-step feature selection to characterize a condensed subset of optimal features that contributed most to the type-specific prediction of glycosylation sites. Five-fold cross-validation and independent tests show that this approach significantly improved the prediction performance compared with four existing prediction tools: NetNGlyc, NetOGlyc, EnsembleGly and GPP. We demonstrated that this tool could identify candidate glycosylation sites in case study proteins and applied it to identify many high-confidence glycosylation target proteins by screening the entire human proteome.},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1093/bioinformatics/btu852}
}
ABSTRACT Motivation: Glycosylation is a ubiquitous type of protein post-translational modification (PTM) in eukaryotic cells, which plays vital roles in various biological processes (BPs) such as cellular communication, ligand recognition and subcellular recognition. It is estimated that >50% of the entire human proteome is glycosylated. However, it is still a significant challenge to identify glycosylation sites, which requires expensive/laborious experimental research. Thus, bioinformatics approaches that can predict the glycan occupancy at specific sequons in protein sequences would be useful for understanding and utilizing this important PTM. Results: In this study, we present a novel bioinformatics tool called GlycoMine, which is a comprehensive tool for the systematic in silico identification of C-linked, N-linked, and O-linked glycosylation sites in the human proteome. GlycoMine was developed using the random forest algorithm and evaluated based on a well-prepared up-to-date benchmark dataset that encompasses all three types of glycosylation sites, which was curated from multiple public resources. Heterogeneous sequences and functional features were derived from various sources, and subjected to further two-step feature selection to characterize a condensed subset of optimal features that contributed most to the type-specific prediction of glycosylation sites. Five-fold cross-validation and independent tests show that this approach significantly improved the prediction performance compared with four existing prediction tools: NetNGlyc, NetOGlyc, EnsembleGly and GPP. We demonstrated that this tool could identify candidate glycosylation sites in case study proteins and applied it to identify many high-confidence glycosylation target proteins by screening the entire human proteome.
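
The general recipe described in the abstract, a filter-style ranking followed by greedy forward selection and a random forest, might be sketched as below. Mutual information stands in for the filter criterion, the data are synthetic, and nothing here corresponds to the actual GlycoMine features or pipeline.

# Generic two-step feature selection plus random forest, as an illustration
# of the recipe only: rank with a filter, then add features greedily.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=300, n_features=30, n_informative=6, random_state=1)

# Step 1: filter ranking (mutual information stands in for mRMR here).
ranking = np.argsort(mutual_info_classif(X, y, random_state=1))[::-1]

# Step 2: greedy forward selection over the ranked candidates.
selected, best_score = [], 0.0
for f in ranking[:15]:
    candidate = selected + [f]
    score = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=1),
                            X[:, candidate], y, cv=5).mean()
    if score > best_score:
        selected, best_score = candidate, score

print("selected features:", selected, "cv accuracy:", round(best_score, 3))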

Cao, L., Zhang, C., Joachims, T., Webb, G. I., Margineantu, D. D., & Williams, G. (Ed). (2015). Proceedings of the 21th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining. ACM.
[URL] [Bibtex]

@Proceedings{WebbKDD2015,
Title = {Proceedings of the 21th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
Year = {2015},
Editor = {L. Cao and C. Zhang and T. Joachims and G. I. Webb and D. D. Margineantu and G. Williams},
Publisher = {ACM},
Url = {http://dl.acm.org/citation.cfm?id=2783258&CFID=585807029&CFTOKEN=47444098},
Urltext = {Link to proceedings}
}
ABSTRACT 

Zaidi, N., Carman, M., Cerquides, J., & Webb, G. I. (2014). Naive-Bayes Inspired Effective Pre-Conditioner for Speeding-up Logistic Regression. Proceedings of the 14th IEEE International Conference on Data Mining, pp. 1097-1102.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZaidiEtAl14,
Title = {Naive-{Bayes} Inspired Effective Pre-Conditioner for Speeding-up Logistic Regression},
Author = {N. Zaidi and M. Carman and J. Cerquides and G.I. Webb},
Booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
Year = {2014},
Pages = {1097-1102},
Abstract = {We propose an alternative parameterization of
Logistic Regression (LR) for the categorical data, multi-class
setting. LR optimizes the conditional log-likelihood over the
training data and is based on an iterative optimization procedure
to tune this objective function. The optimization procedure
employed may be sensitive to scale and hence an effective
pre-conditioning method is recommended. Many problems in
machine learning involve arbitrary scales or categorical data
(where simple standardization of features is not applicable).
The problem can be alleviated by using optimization routines
that are invariant to scale such as (second-order) Newton
methods. However, computing and inverting the Hessian is a
costly procedure and not feasible for big data. Thus one must
often rely on first-order methods such as gradient descent (GD),
stochastic gradient descent (SGD) or approximate second-order
such as quasi-Newton (QN) routines, which are not
invariant to scale. This paper proposes a simple yet effective
pre-conditioner for speeding-up LR based on naive Bayes
conditional probability estimates. The idea is to scale each
attribute by the log of the conditional probability of that
attribute given the class. This formulation substantially speeds up
LR's convergence. It also provides a weighted naive Bayes
formulation which yields an effective framework for hybrid
generative-discriminative classification.},
Keywords = {Conditional Probability Estimation and WANBIA and DP140100087},
Related = {combining-generative-and-discriminative-learning},
Url = {http://dx.doi.org/10.1109/ICDM.2014.53}
}
ABSTRACT We propose an alternative parameterization of Logistic Regression (LR) for the categorical data, multi-class setting. LR optimizes the conditional log-likelihood over the training data and is based on an iterative optimization procedure to tune this objective function. The optimization procedure employed may be sensitive to scale and hence an effective pre-conditioning method is recommended. Many problems in machine learning involve arbitrary scales or categorical data (where simple standardization of features is not applicable). The problem can be alleviated by using optimization routines that are invariant to scale such as (second-order) Newton methods. However, computing and inverting the Hessian is a costly procedure and not feasible for big data. Thus one must often rely on first-order methods such as gradient descent (GD), stochastic gradient descent (SGD) or approximate second-order such as quasi-Newton (QN) routines, which are not invariant to scale. This paper proposes a simple yet effective pre-conditioner for speeding-up LR based on naive Bayes conditional probability estimates. The idea is to scale each attribute by the log of the conditional probability of that attribute given the class. This formulation substantially speeds up LR's convergence. It also provides a weighted naive Bayes formulation which yields an effective framework for hybrid generative-discriminative classification.
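
A rough illustration of the scaling idea stated in the abstract, mapping each attribute value to the logs of smoothed naive-Bayes class-conditional probabilities before applying logistic regression, is given below; it is not the authors' exact parameterization, and the helper name and toy data are invented.

# Rough illustration only: re-scale categorical attributes by naive-Bayes
# log conditional probabilities, then train logistic regression on them.
import numpy as np
from collections import defaultdict
from sklearn.linear_model import LogisticRegression

def nb_log_features(X_cat, y, classes, smoothing=1.0):
    n, d = X_cat.shape
    counts = defaultdict(float)
    class_totals = defaultdict(float)
    for i in range(n):
        for j in range(d):
            counts[(j, X_cat[i, j], y[i])] += 1.0
        class_totals[y[i]] += 1.0
    feats = np.zeros((n, d * len(classes)))
    for i in range(n):
        col = 0
        for j in range(d):
            for c in classes:
                num = counts[(j, X_cat[i, j], c)] + smoothing
                den = class_totals[c] + 2.0 * smoothing   # assumes binary-valued attributes
                feats[i, col] = np.log(num / den)
                col += 1
    return feats

X_cat = np.array([["y", "n"], ["y", "y"], ["n", "y"], ["n", "n"], ["y", "y"], ["n", "n"]])
y = np.array([1, 1, 0, 0, 1, 0])
Z = nb_log_features(X_cat, y, classes=[0, 1])
print(LogisticRegression(max_iter=1000).fit(Z, y).predict(Z))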

Chen, S., Martinez, A., & Webb, G. I. (2014). Highly Scalable Attribute Selection for AODE. Proceedings of the 18th Pacific-Asia Conference on Knowledge Discovery and Data Mining, pp. 86-97.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ChenEtAl14,
Title = {Highly Scalable Attribute Selection for AODE},
Author = {S. Chen and A. Martinez and G.I. Webb},
Booktitle = {Proceedings of the 18th {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining},
Year = {2014},
Pages = {86-97},
Abstract = {Averaged One-Dependence Estimators (AODE) is a popular
and effective approach to Bayesian learning. In this paper, a new
attribute selection approach is proposed for AODE. It can search in a
large model space, while it requires only a single extra pass through the
training data, resulting in a computationally efficient two-pass learning
algorithm. The experimental results indicate that the new technique significantly
reduces AODE's bias at the cost of a modest increase in training
time. Its low bias and computational efficiency make it an attractive
algorithm for learning from big data.},
Keywords = {Conditional Probability Estimation and AODE and DP140100087},
Related = {learning-complex-conditional-probabilities-from-data},
Url = {http://dx.doi.org/10.1007/978-3-319-06605-9_8}
}
ABSTRACT Averaged One-Dependence Estimators (AODE) is a popular and effective approach to Bayesian learning. In this paper, a new attribute selection approach is proposed for AODE. It can search in a large model space, while it requires only a single extra pass through the training data, resulting in a computationally efficient two-pass learning algorithm. The experimental results indicate that the new technique significantly reduces AODE's bias at the cost of a modest increase in training time. Its low bias and computational efficiency make it an attractive algorithm for learning from big data.
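
For context, the standard AODE estimate that this attribute-selection work builds on (as defined in the AODE literature, not specific to this paper's selection scheme) averages over all one-dependence estimators whose super-parent value x_i occurs at least m times in the training data:

\[
\hat{P}(y, \mathbf{x}) \;=\; \frac{1}{\left|\{\, i : F(x_i) \ge m \,\}\right|} \sum_{i :\, F(x_i) \ge m} \hat{P}(y, x_i) \prod_{j=1}^{a} \hat{P}(x_j \mid y, x_i),
\]

where a is the number of attributes, F(x_i) is the training frequency of value x_i and m is a minimum-frequency threshold; classification selects the class y that maximizes this estimate.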

Webb, G. I., & Vreeken, J. (2014). Efficient Discovery of the Most Interesting Associations. ACM Transactions on Knowledge Discovery from Data, 8(3), Art. no. 15.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbVreeken13,
Title = {Efficient Discovery of the Most Interesting Associations},
Author = {G.I. Webb and J. Vreeken},
Journal = {{ACM} Transactions on Knowledge Discovery from Data},
Year = {2014},
Number = {3},
Volume = {8},
Abstract = {Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations
in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient
requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as
consideration of all supersets. This paper presents the first published algorithm for efficiently discovering
self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms
based on upper-bounds on itemset value and statistical significance level. It demonstrates that finding top-k
productive and non-redundant itemsets, with post processing to identify those that are not independently
productive, can efficiently identify small sets of key associations. We present extensive evaluation of the
strengths and limitations of the technique including comparisons with alternative approaches to finding the
most interesting associations.},
Articlenumber = {15},
Direct-url = {http://dx.doi.org/10.1145/2601433},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Publisher = {ACM},
Related = {filtered-top-k-association-discovery},
Url = {http://dl.acm.org/authorize?N80829}
}
ABSTRACT Self-sufficient itemsets have been proposed as an effective approach to summarizing the key associations in data. However, their computation appears highly demanding, as assessing whether an itemset is self-sufficient requires consideration of all pairwise partitions of the itemset into pairs of subsets as well as consideration of all supersets. This paper presents the first published algorithm for efficiently discovering self-sufficient itemsets. This branch-and-bound algorithm deploys two powerful pruning mechanisms based on upper-bounds on itemset value and statistical significance level. It demonstrates that finding top-k productive and non-redundant itemsets, with post processing to identify those that are not independently productive, can efficiently identify small sets of key associations. We present extensive evaluation of the strengths and limitations of the technique including comparisons with alternative approaches to finding the most interesting associations.
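
A minimal sketch of the productivity component of self-sufficiency, checking that an itemset is more frequent than expected under independence for every two-part partition via a Fisher exact test, is given below. The redundancy and independent-productivity checks, the upper bounds and the branch-and-bound search that make the published algorithm efficient are all omitted, and the toy data are invented.

# Productivity check only: every two-part partition must show significantly
# higher co-occurrence than independence would predict.
from itertools import combinations
from scipy.stats import fisher_exact

def cover(itemset, transactions):
    itemset = set(itemset)
    return [itemset <= set(t) for t in transactions]

def is_productive(itemset, transactions, alpha=0.05):
    n = len(transactions)
    items = list(itemset)
    for r in range(1, len(items) // 2 + 1):
        for left in combinations(items, r):
            right = [i for i in items if i not in left]
            lc, rc = cover(left, transactions), cover(right, transactions)
            a = sum(1 for l_, r_ in zip(lc, rc) if l_ and r_)       # both parts present
            b = sum(1 for l_, r_ in zip(lc, rc) if l_ and not r_)
            c = sum(1 for l_, r_ in zip(lc, rc) if r_ and not l_)
            d = n - a - b - c
            _, p = fisher_exact([[a, b], [c, d]], alternative="greater")
            if p > alpha:
                return False
    return True

data = [("a", "b", "c"), ("a", "b"), ("a", "b", "c"), ("c",), ("a", "b")] * 30
print(is_productive(("a", "b"), data))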

Petitjean, F., Allison, L., & Webb, G. I. (2014). A Statistically Efficient and Scalable Method for Log-Linear Analysis of High-Dimensional Data. Proceedings of the 14th IEEE International Conference on Data Mining, pp. 480-489.
One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{PetitjeanEtAl14a,
Title = {A Statistically Efficient and Scalable Method for Log-Linear Analysis of High-Dimensional Data},
Author = {F. Petitjean and L. Allison and G.I. Webb},
Booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
Year = {2014},
Pages = {480-489},
Abstract = {Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of χ2 tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches -- statistical efficiency -- 2) controls for the risk of false discoveries as well as statistical approaches -- high precision - and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer -- computational efficiency.},
Comment = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and DP140100087},
Related = {scalable-graphical-modeling},
Url = {http://dx.doi.org/10.1109/ICDM.2014.23}
}
ABSTRACT Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of χ2 tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches -- statistical efficiency -- 2) controls for the risk of false discoveries as well as statistical approaches -- high precision - and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer -- computational efficiency.

Petitjean, F., Forestier, G., Webb, G. I., Nicholson, A., Chen, Y., & Keogh, E. (2014). Dynamic Time Warping Averaging of Time Series Allows Faster and More Accurate Classification. Proceedings of the 14th IEEE International Conference on Data Mining, pp. 470-479.
One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
[PDF] [URL] [Bibtex] [Abstract]

@InProceedings{PetitjeanEtAl14b,
Title = {Dynamic Time Warping Averaging of Time Series Allows Faster and More Accurate Classification},
Author = {F. Petitjean and G. Forestier and G.I. Webb and A. Nicholson and Y. Chen and E. Keogh},
Booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
Year = {2014},
Pages = {470-479},
Abstract = {Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' time series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.},
Comment = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
Keywords = {time series},
Url = {http://dx.doi.org/10.1109/ICDM.2014.27}
}
ABSTRACT Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' time series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.
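
A toy sketch of the nearest-"centroid"-under-DTW classifier shape discussed above follows. The DTW routine is a plain dynamic-programming implementation and the per-class centroid is a simple arithmetic mean rather than the DTW-based averaging the paper relies on, so this shows only the structure of the classifier, not its accuracy claims.

# Toy nearest-centroid classifier under DTW distance; centroids here are
# plain means, standing in for the warped averaging used in the paper.
import numpy as np

def dtw(a, b):
    """Classic O(len(a)*len(b)) dynamic time warping distance."""
    n, m = len(a), len(b)
    D = np.full((n + 1, m + 1), np.inf)
    D[0, 0] = 0.0
    for i in range(1, n + 1):
        for j in range(1, m + 1):
            cost = (a[i - 1] - b[j - 1]) ** 2
            D[i, j] = cost + min(D[i - 1, j], D[i, j - 1], D[i - 1, j - 1])
    return np.sqrt(D[n, m])

def nearest_centroid_predict(centroids, series):
    return min(centroids, key=lambda label: dtw(centroids[label], series))

train = {"sine": [np.sin(np.linspace(0, 6, 50)) for _ in range(5)],
         "ramp": [np.linspace(-1, 1, 50) + 0.05 * np.random.randn(50) for _ in range(5)]}
centroids = {label: np.mean(series_list, axis=0) for label, series_list in train.items()}
print(nearest_centroid_predict(centroids, np.sin(np.linspace(0, 6, 50)) + 0.1))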

Provost, F., Webb, G. I., Bekkerman, R., Etzioni, O., Fayyad, U., & Perlich, C. (2014). A Data Scientist's Guide to Start-Ups. Big Data, 2(3), 117-128.
[URL] [Bibtex] [Abstract]

@Article{ProvostEtAl14,
Title = {A Data Scientist's Guide to Start-Ups},
Author = {F. Provost and G. I. Webb and R. Bekkerman and O. Etzioni and U. Fayyad and C. Perlich},
Journal = {Big Data},
Year = {2014},
Number = {3},
Pages = {117-128},
Volume = {2},
Abstract = {In August 2013, we held a panel discussion at the KDD 2013 conference in Chicago on the subject of data science, data scientists, and start-ups. KDD is the premier conference on data science research and practice. The panel discussed the pros and cons for top-notch data scientists of the hot data science start-up scene. In this article, we first present background on our panelists. Our four panelists have unquestionable pedigrees in data science and substantial experience with start-ups from multiple perspectives (founders, employees, chief scientists, venture capitalists). For the casual reader, we next present a brief summary of the experts' opinions on eight of the issues the panel discussed. The rest of the article presents a lightly edited transcription of the entire panel discussion.},
Keywords = {Big Data},
Url = {http://dx.doi.org/10.1089/big.2014.0031}
}
ABSTRACT In August 2013, we held a panel discussion at the KDD 2013 conference in Chicago on the subject of data science, data scientists, and start-ups. KDD is the premier conference on data science research and practice. The panel discussed the pros and cons for top-notch data scientists of the hot data science start-up scene. In this article, we first present background on our panelists. Our four panelists have unquestionable pedigrees in data science and substantial experience with start-ups from multiple perspectives (founders, employees, chief scientists, venture capitalists). For the casual reader, we next present a brief summary of the experts' opinions on eight of the issues the panel discussed. The rest of the article presents a lightly edited transcription of the entire panel discussion.

Webb, G. I. (2014). Contrary to Popular Belief Incremental Discretization can be Sound, Computationally Efficient and Extremely Useful for Streaming Data. Proceedings of the 14th IEEE International Conference on Data Mining, pp. 1031-1036.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb14,
Title = {Contrary to Popular Belief Incremental Discretization can be Sound, Computationally Efficient and Extremely Useful for Streaming Data},
Author = {G.I. Webb},
Booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
Year = {2014},
Pages = {1031-1036},
Abstract = {Discretization of streaming data has received surprisingly
little attention. This might be because streaming data
require incremental discretization with cutpoints that may vary
over time and this is perceived as undesirable. We argue, to
the contrary, that it can be desirable for a discretization to
evolve in synchronization with an evolving data stream, even
when the learner assumes that attribute values' meanings remain
invariant over time. We examine the issues associated with
discretization in the context of distribution drift and develop
computationally efficient incremental discretization algorithms.
We show that discretization can reduce the error of a classical
incremental learner and that allowing a discretization to drift in
synchronization with distribution drift can further reduce error.},
Keywords = {Concept Drift and Discretization and Incremental Learning and Stream mining},
Related = {learning-from-non-stationary-distributions},
Url = {http://dx.doi.org/10.1109/ICDM.2014.123}
}
ABSTRACT Discretization of streaming data has received surprisingly little attention. This might be because streaming data require incremental discretization with cutpoints that may vary over time and this is perceived as undesirable. We argue, to the contrary, that it can be desirable for a discretization to evolve in synchronization with an evolving data stream, even when the learner assumes that attribute values' meanings remain invariant over time. We examine the issues associated with discretization in the context of distribution drift and develop computationally efficient incremental discretization algorithms. We show that discretization can reduce the error of a classical incremental learner and that allowing a discretization to drift in synchronization with distribution drift can further reduce error.
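
One simple way to let a discretization evolve with a drifting stream, recomputing equal-frequency cut points over a sliding window, is sketched below. This illustrates the general idea only and is not one of the computationally efficient algorithms developed in the paper; recomputing quantiles on every update is deliberately naive.

# Naive illustration: sliding-window equal-frequency discretization whose
# cut points track a drifting stream.
from collections import deque
import numpy as np

class WindowedEqualFrequencyDiscretizer:
    def __init__(self, n_bins=5, window=1000):
        self.n_bins = n_bins
        self.window = deque(maxlen=window)
        self.cutpoints = []

    def update(self, value):
        self.window.append(value)
        qs = np.linspace(0, 1, self.n_bins + 1)[1:-1]
        self.cutpoints = list(np.quantile(list(self.window), qs))

    def transform(self, value):
        return int(np.searchsorted(self.cutpoints, value))

disc = WindowedEqualFrequencyDiscretizer(n_bins=4, window=200)
rng = np.random.default_rng(0)
for t in range(2000):
    x = rng.normal(loc=t / 500.0)      # slowly drifting distribution
    disc.update(x)
print("current cut points:", [round(c, 2) for c in disc.cutpoints])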

Li, Y., Wang, M., Wang, H., Tan, H., Zhang, Z., Webb, G. I., & Song, J. (2014). Accurate in Silico Identification of Species-Specific Acetylation Sites by Integrating Protein Sequence-Derived and Functional Features. Scientific Reports, 4, Art. no. 5765.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{LiEtAl2014,
Title = {Accurate in Silico Identification of Species-Specific Acetylation Sites by Integrating Protein Sequence-Derived and Functional Features},
Author = {Y. Li and M. Wang and H. Wang and H. Tan and Z. Zhang and G. I. Webb and J. Song},
Journal = {Scientific Reports},
Year = {2014},
Volume = {4},
Abstract = {Lysine acetylation is a reversible post-translational modification, playing an important role in cytokine signaling, transcriptional regulation, and apoptosis. To fully understand acetylation mechanisms, identification of substrates and specific acetylation sites is crucial. Experimental identification is often time-consuming and expensive. Alternative bioinformatics methods are cost-effective and can be used in a high-throughput manner to generate relatively precise predictions. Here we develop a method termed as SSPKA for species-specific lysine acetylation prediction, using random forest classifiers that combine sequence-derived and functional features with two-step feature selection. Feature importance analysis indicates functional features, applied for lysine acetylation site prediction for the first time, significantly improve the predictive performance. We apply the SSPKA model to screen the entire human proteome and identify many high-confidence putative substrates that are not previously identified. The results along with the implemented Java tool, serve as useful resources to elucidate the mechanism of lysine acetylation and facilitate hypothesis-driven experimental design and validation.},
Articlenumber = {5765},
Keywords = {Bioinformatics and DP140100087},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1038/srep05765}
}
ABSTRACT Lysine acetylation is a reversible post-translational modification, playing an important role in cytokine signaling, transcriptional regulation, and apoptosis. To fully understand acetylation mechanisms, identification of substrates and specific acetylation sites is crucial. Experimental identification is often time-consuming and expensive. Alternative bioinformatics methods are cost-effective and can be used in a high-throughput manner to generate relatively precise predictions. Here we develop a method termed as SSPKA for species-specific lysine acetylation prediction, using random forest classifiers that combine sequence-derived and functional features with two-step feature selection. Feature importance analysis indicates functional features, applied for lysine acetylation site prediction for the first time, significantly improve the predictive performance. We apply the SSPKA model to screen the entire human proteome and identify many high-confidence putative substrates that are not previously identified. The results along with the implemented Java tool, serve as useful resources to elucidate the mechanism of lysine acetylation and facilitate hypothesis-driven experimental design and validation.

Petitjean, F., Webb, G. I., & Nicholson, A. E. (2013). Scaling log-linear analysis to high-dimensional data. Proceedings of the 13th IEEE International Conference on Data Mining, pp. 597-606.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{PetitjeanEtAl13,
Title = {Scaling log-linear analysis to high-dimensional data},
Author = {F. Petitjean and G. I. Webb and A. E. Nicholson},
Booktitle = {Proceedings of the 13th {IEEE} International Conference on Data Mining},
Year = {2013},
Pages = {597-606},
Abstract = {Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We develop an efficient approach to log-linear analysis that scales to hundreds of variables by melding the classical statistical machinery of log-linear analysis with advanced data mining techniques from association discovery and graphical modeling.},
Doi = {10.1109/ICDM.2013.17},
Keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
Related = {scalable-graphical-modeling}
}
ABSTRACT Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We develop an efficient approach to log-linear analysis that scales to hundreds of variables by melding the classical statistical machinery of log-linear analysis with advanced data mining techniques from association discovery and graphical modeling.

Provost, F., & Webb, G. I. (2013). Panel: a data scientist's guide to making money from start-ups. Proceedings of the 19th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 1445-1445.
[URL] [Bibtex]

@InProceedings{ProvostWebb13,
Title = {Panel: a data scientist's guide to making money from start-ups},
Author = {F. Provost and G. I. Webb},
Booktitle = {Proceedings of the 19th {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining},
Year = {2013},
Pages = {1445-1445},
Url = {http://dl.acm.org/citation.cfm?doid=2487575.2494523}
}
ABSTRACT 

Zaidi, N., & Webb, G. I. (2013). Fast and Effective Single Pass Bayesian Learning. Proceedings of the 17th Pacific-Asia Conference on Knowledge Discovery and Data Mining, pp. 149-160.
[PDF] [DOI] [Bibtex]  → Related papers and software

@InProceedings{ZaidiWebb13,
Title = {Fast and Effective Single Pass Bayesian Learning},
Author = {N. Zaidi and G. I. Webb},
Booktitle = {Proceedings of the 17th {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining},
Year = {2013},
Pages = {149-160},
Doi = {10.1007/978-3-642-37453-1_13},
Keywords = {Conditional Probability Estimation and AODE},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT 

Suraweera, P., Webb, G. I., Evans, I., & Wallace, M. (2013). Learning crew scheduling constraints from historical schedules. Transportation Research Part C: Emerging Technologies, 26, 214-232.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Suraweera2013,
Title = {Learning crew scheduling constraints from historical schedules},
Author = {P. Suraweera and G.I. Webb and I. Evans and M. Wallace},
Journal = {Transportation Research Part C: Emerging Technologies},
Year = {2013},
Pages = {214 - 232},
Volume = {26},
Abstract = {For most airlines, there are numerous policies, agreements and regulations that govern the workload of airline crew. Although some constraints are formally documented, there are many others based on established practice and tacit understanding. Consequently, the task of developing a formal representation of the constraints that govern the working conditions of an airline’s crew requires extensive time and effort involving interviews with the airline’s crew schedulers and detailed analysis of historical schedules. We have developed a system that infers crew scheduling constraints from historical crew schedules with the assistance of a domain expert. This system implements the ComCon algorithm developed to learn constraints that prescribe the limits of certain aspects of crew schedules. The algorithm induces complex multivariate constraints based on a set of user provided templates that outline the general structure of important constraints. The results of an evaluation conducted with crew schedules from two commercial airlines show that the system is capable of learning the majority of the minimum rest constraints.},
Doi = {10.1016/j.trc.2012.08.002},
Keywords = {Engineering Applications},
Related = {engineering-applications}
}
ABSTRACT For most airlines, there are numerous policies, agreements and regulations that govern the workload of airline crew. Although some constraints are formally documented, there are many others based on established practice and tacit understanding. Consequently, the task of developing a formal representation of the constraints that govern the working conditions of an airline’s crew requires extensive time and effort involving interviews with the airline’s crew schedulers and detailed analysis of historical schedules. We have developed a system that infers crew scheduling constraints from historical crew schedules with the assistance of a domain expert. This system implements the ComCon algorithm developed to learn constraints that prescribe the limits of certain aspects of crew schedules. The algorithm induces complex multivariate constraints based on a set of user provided templates that outline the general structure of important constraints. The results of an evaluation conducted with crew schedules from two commercial airlines show that the system is capable of learning the majority of the minimum rest constraints.
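
As a toy illustration of the kind of limit ComCon learns, the fragment below infers a minimum-rest bound as the tightest value consistent with a made-up historical roster; the real algorithm learns richer, template-driven multivariate constraints and is not reproduced here.

# Toy only: the tightest minimum-rest bound consistent with observed duty pairs.
from datetime import datetime

duties = [  # (duty end, next duty start) pairs from a hypothetical roster
    ("2012-03-01 22:00", "2012-03-02 09:30"),
    ("2012-03-02 18:00", "2012-03-03 06:15"),
    ("2012-03-03 21:30", "2012-03-04 08:45"),
]

def rest_hours(end, start):
    fmt = "%Y-%m-%d %H:%M"
    return (datetime.strptime(start, fmt) - datetime.strptime(end, fmt)).total_seconds() / 3600.0

min_rest = min(rest_hours(e, s) for e, s in duties)
print(f"inferred constraint: rest between duties >= {min_rest:.2f} hours")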

Zaidi, N. A., Cerquides, J., Carman, M. J., & Webb, G. I. (2013). Alleviating Naive Bayes Attribute Independence Assumption by Attribute Weighting. Journal of Machine Learning Research, 14, 1947-1988.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{Zaidi2013,
Title = {Alleviating Naive Bayes Attribute Independence Assumption by Attribute Weighting},
Author = {Nayyar A. Zaidi and Jesus Cerquides and Mark J. Carman and Geoffrey I. Webb},
Journal = {Journal of Machine Learning Research},
Year = {2013},
Pages = {1947-1988},
Volume = {14},
Abstract = {Despite the simplicity of the Naive Bayes classifier, it has continued to perform well against more sophisticated newcomers and has remained, therefore, of great interest to the machine learning community. Of numerous approaches to refining the naive Bayes classifier, attribute weighting has received less attention than it warrants. Most approaches, perhaps influenced by attribute weighting in other machine learning algorithms, use weighting to place more emphasis on highly predictive attributes than those that are less predictive. In this paper, we argue that for naive Bayes attribute weighting should instead be used to alleviate the conditional independence assumption. Based on this premise, we propose a weighted naive Bayes algorithm, called WANBIA, that selects weights to minimize either the negative conditional log likelihood or the mean squared error objective functions. We perform extensive evaluations and find that WANBIA is a competitive alternative to state of the art classifiers like Random Forest, Logistic Regression and A1DE.},
Keywords = {Conditional Probability Estimation and WANBIA},
Related = {combining-generative-and-discriminative-learning},
Url = {http://jmlr.org/papers/volume14/zaidi13a/zaidi13a.pdf},
Urltext = {Link to paper on JMLR site}
}
ABSTRACT Despite the simplicity of the Naive Bayes classifier, it has continued to perform well against more sophisticated newcomers and has remained, therefore, of great interest to the machine learning community. Of numerous approaches to refining the naive Bayes classifier, attribute weighting has received less attention than it warrants. Most approaches, perhaps influenced by attribute weighting in other machine learning algorithms, use weighting to place more emphasis on highly predictive attributes than those that are less predictive. In this paper, we argue that for naive Bayes attribute weighting should instead be used to alleviate the conditional independence assumption. Based on this premise, we propose a weighted naive Bayes algorithm, called WANBIA, that selects weights to minimize either the negative conditional log likelihood or the mean squared error objective functions. We perform extensive evaluations and find that WANBIA is a competitive alternative to state of the art classifiers like Random Forest, Logistic Regression and A1DE.
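
A bare-bones sketch of the weighted naive Bayes idea, naive-Bayes probability estimates combined with one weight per attribute chosen to minimize the negative conditional log-likelihood, is given below. It is illustrative only (binary attributes, a generic optimizer) and is not the WANBIA implementation or its mean-squared-error variant.

# Weighted naive Bayes sketch: NB estimates plus per-attribute weights
# optimized against the negative conditional log-likelihood.
import numpy as np
from scipy.optimize import minimize

X = np.array([[0, 1], [0, 0], [1, 1], [1, 0], [0, 1], [1, 1]])  # binary attributes
y = np.array([1, 1, 0, 0, 1, 0])
classes = np.unique(y)

# Smoothed naive Bayes estimates: log P(y) and log P(x_j = v | y).
log_prior = np.log(np.array([(y == c).mean() for c in classes]))
log_cond = np.zeros((len(classes), X.shape[1], 2))
for ci, c in enumerate(classes):
    Xc = X[y == c]
    for j in range(X.shape[1]):
        for v in (0, 1):
            log_cond[ci, j, v] = np.log(((Xc[:, j] == v).sum() + 1.0) / (len(Xc) + 2.0))

def neg_cll(w):
    scores = np.array([log_prior[ci] + sum(w[j] * log_cond[ci, j, X[n, j]]
                                           for j in range(X.shape[1]))
                       for n in range(len(X)) for ci in range(len(classes))])
    scores = scores.reshape(len(X), len(classes))
    log_probs = scores - np.logaddexp.reduce(scores, axis=1, keepdims=True)
    return -log_probs[np.arange(len(X)), np.searchsorted(classes, y)].sum()

weights = minimize(neg_cll, x0=np.ones(X.shape[1])).x
print("learned attribute weights:", np.round(weights, 3))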

Song, J., Tan, H., Perry, A. J., Akutsu, T., Webb, G. I., Whisstock, J. C., & Pike, R. N. (2012). PROSPER: An Integrated Feature-Based Tool for Predicting Protease Substrate Cleavage Sites. PLoS ONE, 7(11), e50300.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl12b,
Title = {PROSPER: An Integrated Feature-Based Tool for Predicting Protease Substrate Cleavage Sites},
Author = {J. Song and H. Tan and A.J. Perry and T. Akutsu and G.I. Webb and J.C. Whisstock and R.N. Pike},
Journal = {PLoS ONE},
Year = {2012},
Month = {11},
Number = {11},
Pages = {e50300},
Volume = {7},
Abstract = {The ability to catalytically cleave protein substrates after synthesis is fundamental for all forms of life. Accordingly, site-specific proteolysis is one of the most important post-translational modifications. The key to understanding the physiological role of a protease is to identify its natural substrate(s). Knowledge of the substrate specificity of a protease can dramatically improve our ability to predict its target protein substrates, but this information must be utilized in an effective manner in order to efficiently identify protein substrates by in silico approaches. To address this problem, we present PROSPER, an integrated feature-based server for in silico identification of protease substrates and their cleavage sites for twenty-four different proteases. PROSPER utilizes established specificity information for these proteases (derived from the MEROPS database) with a machine learning approach to predict protease cleavage sites by using different, but complementary sequence and structure characteristics. Features used by PROSPER include local amino acid sequence profile, predicted secondary structure, solvent accessibility and predicted native disorder. Thus, for proteases with known amino acid specificity, PROSPER provides a convenient, pre-prepared tool for use in identifying protein substrates for the enzymes. Systematic prediction analysis for the twenty-four proteases thus far included in the database revealed that the features we have included in the tool strongly improve performance in terms of cleavage site prediction, as evidenced by their contribution to performance improvement in terms of identifying known cleavage sites in substrates for these enzymes. In comparison with two state-of-the-art prediction tools, PoPS and SitePrediction, PROSPER achieves greater accuracy and coverage. To our knowledge, PROSPER is the first comprehensive server capable of predicting cleavage sites of multiple proteases within a single substrate sequence using machine learning techniques. It is freely available at http://lightning.med.monash.edu.au/PROSPER/.},
Keywords = {Bioinformatics},
Publisher = {Public Library of Science},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1371%2Fjournal.pone.0050300}
}
ABSTRACT The ability to catalytically cleave protein substrates after synthesis is fundamental for all forms of life. Accordingly, site-specific proteolysis is one of the most important post-translational modifications. The key to understanding the physiological role of a protease is to identify its natural substrate(s). Knowledge of the substrate specificity of a protease can dramatically improve our ability to predict its target protein substrates, but this information must be utilized in an effective manner in order to efficiently identify protein substrates by in silico approaches. To address this problem, we present PROSPER, an integrated feature-based server for in silico identification of protease substrates and their cleavage sites for twenty-four different proteases. PROSPER utilizes established specificity information for these proteases (derived from the MEROPS database) with a machine learning approach to predict protease cleavage sites by using different, but complementary sequence and structure characteristics. Features used by PROSPER include local amino acid sequence profile, predicted secondary structure, solvent accessibility and predicted native disorder. Thus, for proteases with known amino acid specificity, PROSPER provides a convenient, pre-prepared tool for use in identifying protein substrates for the enzymes. Systematic prediction analysis for the twenty-four proteases thus far included in the database revealed that the features we have included in the tool strongly improve performance in terms of cleavage site prediction, as evidenced by their contribution to performance improvement in terms of identifying known cleavage sites in substrates for these enzymes. In comparison with two state-of-the-art prediction tools, PoPS and SitePrediction, PROSPER achieves greater accuracy and coverage. To our knowledge, PROSPER is the first comprehensive server capable of predicting cleavage sites of multiple proteases within a single substrate sequence using machine learning techniques. It is freely available at http://lightning.med.monash.edu.au/PROSPER/.

Song, J., Tan, H., Wang, M., Webb, G. I., & Akutsu, T. (2012). TANGLE: Two-Level Support Vector Regression Approach for Protein Backbone Torsion Angle Prediction from Primary Sequences. PLoS ONE, 7(2), e30361.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl12,
Title = {TANGLE: Two-Level Support Vector Regression Approach for Protein Backbone Torsion Angle Prediction from Primary Sequences},
Author = {Song, Jiangning and Tan, Hao and Wang, Mingjun and Webb, Geoffrey I. and Akutsu, Tatsuya},
Journal = {PLoS ONE},
Year = {2012},
Month = {02},
Number = {2},
Pages = {e30361},
Volume = {7},
Abstract = {Protein backbone torsion angles Phi and Psi are the rotation angles around the Cα-N bond (Phi) and the Cα-C bond (Psi). Due to the planarity of the linked rigid peptide bonds, these two angles essentially determine the backbone geometry of proteins. Accordingly, accurate prediction of protein backbone torsion angles from sequence information can assist the prediction of protein structures. In this study, we develop a new approach called TANGLE (Torsion ANGLE predictor) to predict the protein backbone torsion angles from amino acid sequences. TANGLE uses a two-level support vector regression approach to perform real-value torsion angle prediction using a variety of features derived from amino acid sequences, including evolutionary profiles in the form of position-specific scoring matrices, predicted secondary structure, solvent accessibility and natively disordered regions, as well as other global sequence features. When evaluated on a large benchmark dataset of 1,526 non-homologous proteins, the mean absolute errors (MAEs) of the Phi and Psi angle predictions are 27.8° and 44.6°, respectively, which are 1% and 3% lower, respectively, than those of the state-of-the-art prediction tool ANGLOR. Moreover, TANGLE's predictions are significantly better than those of a random predictor built on an amino acid-specific basis, with p-values < 1.46e-147 and 7.97e-150, respectively, by the Wilcoxon signed rank test. As a complementary approach to current torsion angle prediction algorithms, TANGLE should prove useful in predicting protein structural properties and assisting protein fold recognition by applying the predicted torsion angles as useful restraints. TANGLE is freely accessible at http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/TANGLE/.},
Doi = {10.1371/journal.pone.0030361},
Keywords = {Bioinformatics},
Publisher = {Public Library of Science},
Related = {computational-biology},
Url = {http://dx.doi.org/10.1371%2Fjournal.pone.0030361}
}
ABSTRACT Protein backbone torsion angles Phi and Psi are the rotation angles around the Cα-N bond (Phi) and the Cα-C bond (Psi). Due to the planarity of the linked rigid peptide bonds, these two angles essentially determine the backbone geometry of proteins. Accordingly, accurate prediction of protein backbone torsion angles from sequence information can assist the prediction of protein structures. In this study, we develop a new approach called TANGLE (Torsion ANGLE predictor) to predict the protein backbone torsion angles from amino acid sequences. TANGLE uses a two-level support vector regression approach to perform real-value torsion angle prediction using a variety of features derived from amino acid sequences, including evolutionary profiles in the form of position-specific scoring matrices, predicted secondary structure, solvent accessibility and natively disordered regions, as well as other global sequence features. When evaluated on a large benchmark dataset of 1,526 non-homologous proteins, the mean absolute errors (MAEs) of the Phi and Psi angle predictions are 27.8° and 44.6°, respectively, which are 1% and 3% lower, respectively, than those of the state-of-the-art prediction tool ANGLOR. Moreover, TANGLE's predictions are significantly better than those of a random predictor built on an amino acid-specific basis, with p-values < 1.46e-147 and 7.97e-150, respectively, by the Wilcoxon signed rank test. As a complementary approach to current torsion angle prediction algorithms, TANGLE should prove useful in predicting protein structural properties and assisting protein fold recognition by applying the predicted torsion angles as useful restraints. TANGLE is freely accessible at http://sunflower.kuicr.kyoto-u.ac.jp/~sjn/TANGLE/.
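Editor's note: the two-level regression architecture referred to above can be sketched generically as stacked support vector regression, where a first-level SVR produces an initial angle estimate that is appended to the features of a second-level SVR. The synthetic data and scikit-learn models below are assumptions used purely to show the structure, not TANGLE's features or training protocol.

    # Generic two-level (stacked) support vector regression sketch
    # (synthetic data; illustrative of the architecture only).
    import numpy as np
    from sklearn.svm import SVR

    rng = np.random.default_rng(0)
    X = rng.normal(size=(200, 20))                      # stand-in sequence features
    y = 90 * X[:, 0] + rng.normal(scale=10, size=200)   # stand-in torsion angles

    level1 = SVR(kernel="rbf").fit(X, y)                # first-pass prediction
    X2 = np.hstack([X, level1.predict(X).reshape(-1, 1)])
    level2 = SVR(kernel="rbf").fit(X2, y)               # refine using the first pass

    x_new = rng.normal(size=(1, 20))
    x2_new = np.hstack([x_new, level1.predict(x_new).reshape(-1, 1)])
    print(level2.predict(x2_new))

In practice the second level would be trained on held-out first-level predictions to avoid overfitting to the first level's training error.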

Martinez, A., Webb, G. I., Flores, M., & Gamez, J. (2012). Non-Disjoint Discretization for Aggregating One-Dependence Estimator Classifiers. Proceedings of the 7th International Conference on Hybrid Artificial Intelligent Systems, Berlin / Heidelberg, pp. 151-162.
[PDF] [Bibtex]  → Related papers and software

@InProceedings{MartinezEtAl12,
Title = {Non-Disjoint Discretization for Aggregating One-Dependence Estimator Classifiers},
Author = {A. Martinez and G. I. Webb and M. Flores and J. Gamez},
Booktitle = {Proceedings of the 7th International Conference on Hybrid Artificial Intelligent Systems},
Year = {2012},
Address = {Berlin / Heidelberg},
Pages = {151-162},
Publisher = {Springer},
ISBN = {978-3-642-28930-9},
Keywords = {Conditional Probability Estimation and AODE and discretization for naive bayes},
Related = {discretization-for-naive-bayes}
}
ABSTRACT 

Salem, H., Suraweera, P., Webb, G. I., & Boughton, J. R. (2012). Techniques for Efficient Learning without Search. Proceedings of the 16th Pacific-Asia Conference, PAKDD 2012, Berlin/Heidelberg, pp. 50-61.
[PDF] [URL] [Bibtex]  → Related papers and software

@InProceedings{SalemEtAl12,
Title = {Techniques for Efficient Learning without Search},
Author = {H. Salem and P. Suraweera and G.I. Webb and J.R. Boughton},
Booktitle = {Proceedings of the 16th {Pacific}-{Asia} Conference, PAKDD 2012},
Year = {2012},
Address = {Berlin/Heidelberg},
Pages = {50-61},
Publisher = {Springer},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Kuala Lumpur, Malaysia},
Related = {learning-complex-conditional-probabilities-from-data},
Url = {http://link.springer.com/chapter/10.1007%2F978-3-642-30217-6_5}
}
ABSTRACT 

Mahmood, K., Webb, G. I., Song, J., Whisstock, J. C., & Konagurthu, A. S. (2012). Efficient large-scale protein sequence comparison and gene matching to identify orthologs and co-orthologs. Nucleic Acids Research, 40(6), e44.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{MahmoodEtAl2012,
Title = {Efficient large-scale protein sequence comparison and gene matching to identify orthologs and co-orthologs},
Author = {K. Mahmood and Webb, G.I. and Song, J. and Whisstock, J.C. and Konagurthu, A.S.},
Journal = {Nucleic Acids Research},
Year = {2012},
Number = {6},
Pages = {e44},
Volume = {40},
Abstract = {Broadly, computational ortholog assignment is a three-step process: (i) identify all putative homologs between the genomes, (ii) identify gene anchors and (iii) link anchors to identify the best gene matches given their order and context. In this article, we engineer two methods to improve two important aspects of this pipeline [specifically steps (ii) and (iii)]. First, computing sequence similarity data [step (i)] is a computationally intensive task for large sequence sets, creating a bottleneck in the ortholog assignment pipeline. We have designed a fast and highly scalable sort-join method (afree) based on k-mer counts to rapidly compare all pairs of sequences in a large protein sequence set to identify putative homologs. Second, the availability of complex genomes containing large gene families with a prevalence of complex evolutionary events, such as duplications, has made the task of assigning orthologs and co-orthologs difficult. Here, we have developed an iterative graph matching strategy where at each iteration the best gene assignments are identified, resulting in a set of orthologs and co-orthologs. We find that the afree algorithm is faster than existing methods and maintains high accuracy in identifying similar genes. The iterative graph matching strategy also showed high accuracy in identifying complex gene relationships. Standalone afree is available from http://vbc.med.monash.edu.au/~kmahmood/afree. EGM2, the complete ortholog assignment pipeline (including afree and the iterative graph matching method), is available from http://vbc.med.monash.edu.au/~kmahmood/EGM2.},
Doi = {10.1093/nar/gkr1261},
Eprint = {http://nar.oxfordjournals.org/content/early/2011/12/29/nar.gkr1261.full.pdf+html},
Keywords = {Bioinformatics},
Publisher = {Oxford Journals},
Related = {computational-biology},
Url = {http://nar.oxfordjournals.org/content/early/2011/12/29/nar.gkr1261.abstract}
}
ABSTRACT Broadly, computational ortholog assignment is a three-step process: (i) identify all putative homologs between the genomes, (ii) identify gene anchors and (iii) link anchors to identify the best gene matches given their order and context. In this article, we engineer two methods to improve two important aspects of this pipeline [specifically steps (ii) and (iii)]. First, computing sequence similarity data [step (i)] is a computationally intensive task for large sequence sets, creating a bottleneck in the ortholog assignment pipeline. We have designed a fast and highly scalable sort-join method (afree) based on k-mer counts to rapidly compare all pairs of sequences in a large protein sequence set to identify putative homologs. Second, the availability of complex genomes containing large gene families with a prevalence of complex evolutionary events, such as duplications, has made the task of assigning orthologs and co-orthologs difficult. Here, we have developed an iterative graph matching strategy where at each iteration the best gene assignments are identified, resulting in a set of orthologs and co-orthologs. We find that the afree algorithm is faster than existing methods and maintains high accuracy in identifying similar genes. The iterative graph matching strategy also showed high accuracy in identifying complex gene relationships. Standalone afree is available from http://vbc.med.monash.edu.au/~kmahmood/afree. EGM2, the complete ortholog assignment pipeline (including afree and the iterative graph matching method), is available from http://vbc.med.monash.edu.au/~kmahmood/EGM2.
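Editor's note: the k-mer counting idea behind afree can be conveyed in a few lines of Python. The sketch below simply counts shared k-mers between sequence pairs as a cheap similarity screen; it is not the sort-join implementation described in the paper, and the toy sequences are assumptions.

    # Count shared k-mers between protein sequences as a cheap similarity screen
    # (illustrative only; not the afree sort-join algorithm).
    from collections import Counter

    def kmer_counts(seq, k=3):
        return Counter(seq[i:i + k] for i in range(len(seq) - k + 1))

    def shared_kmers(a, b, k=3):
        ca, cb = kmer_counts(a, k), kmer_counts(b, k)
        return sum(min(ca[m], cb[m]) for m in ca if m in cb)

    seqs = {"geneA": "MKTAYIAKQRQISFVK", "geneB": "MKTAYLAKQRQISFIK", "geneC": "GGSGGSGGSGGSGGSG"}
    for x in seqs:
        for y in seqs:
            if x < y:
                print(x, y, shared_kmers(seqs[x], seqs[y]))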

Webb, G. I., Boughton, J., Zheng, F., Ting, K. M., & Salem, H. (2012). Learning by extrapolation from marginal to full-multivariate probability distributions: Decreasingly naive Bayesian classification. Machine Learning, 86(2), 233-272.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbEtAl12,
Title = {Learning by extrapolation from marginal to full-multivariate probability distributions: Decreasingly naive {Bayesian} classification},
Author = {G.I. Webb and J. Boughton and F. Zheng and K.M. Ting and H. Salem},
Journal = {Machine Learning},
Year = {2012},
Number = {2},
Pages = {233-272},
Volume = {86},
Abstract = {Averaged n-Dependence Estimators (AnDE) is an approach to probabilistic classification learning that learns by extrapolation from marginal
to full-multivariate probability distributions. It utilizes a single parameter that transforms the approach between a low-variance high-bias learner
(Naive Bayes) and a high-variance low-bias learner with Bayes optimal
asymptotic error. It extends the underlying strategy of Averaged One-Dependence Estimators (AODE), which relaxes the Naive Bayes independence assumption while retaining many of Naive Bayes' desirable computational and theoretical properties. AnDE further relaxes the independence assumption by generalizing AODE to higher levels of dependence.
Extensive experimental evaluation shows that the bias-variance trade-off
for Averaged 2-Dependence Estimators results in strong predictive accuracy over a wide range of data sets. It has training time linear with
respect to the number of examples, supports incremental learning, handles directly missing values, and is robust in the face of noise. Beyond
the practical utility of its lower-dimensional variants, AnDE is of interest
in that it demonstrates that it is possible to create low-bias high-variance
generative learners and suggests strategies for developing even more powerful classifiers.},
Address = {Netherlands},
ISSN = {0885-6125},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data},
Url = {http://dx.doi.org/10.1007/s10994-011-5263-6},
Urltext = {Link to paper via SpringerLink}
}
ABSTRACT Averaged n-Dependence Estimators (AnDE) is an approach to probabilistic classification learning that learns by extrapolation from marginal to full-multivariate probability distributions. It utilizes a single parameter that transforms the approach between a low-variance high-bias learner (Naive Bayes) and a high-variance low-bias learner with Bayes optimal asymptotic error. It extends the underlying strategy of Averaged One-Dependence Estimators (AODE), which relaxes the Naive Bayes independence assumption while retaining many of Naive Bayes' desirable computational and theoretical properties. AnDE further relaxes the independence assumption by generalizing AODE to higher levels of dependence. Extensive experimental evaluation shows that the bias-variance trade-off for Averaged 2-Dependence Estimators results in strong predictive accuracy over a wide range of data sets. It has training time linear with respect to the number of examples, supports incremental learning, handles directly missing values, and is robust in the face of noise. Beyond the practical utility of its lower-dimensional variants, AnDE is of interest in that it demonstrates that it is possible to create low-bias high-variance generative learners and suggests strategies for developing even more powerful classifiers.
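Editor's note: for readers unfamiliar with this family of classifiers, the underlying estimate can be written compactly. The LaTeX below is a standard statement of the AnDE estimate as it is usually presented, with A the attribute set, a the number of attributes, x_s the values of the attributes in parent set s, and F(x_s) the training frequency of that value combination; it is included as a reminder, not quoted from the paper.

    \hat{P}(y, \mathbf{x}) \propto
      \sum_{s \in \binom{A}{n},\; F(x_s) \ge m}
        \hat{P}(y, x_s) \prod_{i=1}^{a} \hat{P}(x_i \mid y, x_s)

The sum runs over the parent sets whose value combination occurs at least m times in the training data; n = 0 recovers naive Bayes and n = 1 recovers AODE.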

Zheng, F., Webb, G. I., Suraweera, P., & Zhu, L. (2012). Subsumption Resolution: An Efficient and Effective Technique for Semi-Naive Bayesian Learning. Machine Learning, 87(1), 93-125.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ZhengEtAl12,
Title = {Subsumption Resolution: An Efficient and Effective Technique for Semi-Naive Bayesian Learning},
Author = {F. Zheng and G.I. Webb and P. Suraweera and L. Zhu},
Journal = {Machine Learning},
Year = {2012},
Number = {1},
Pages = {93-125},
Volume = {87},
Abstract = {Semi-naive Bayesian techniques seek to improve the accuracy of naive Bayes (NB) by relaxing the attribute independence assumption. We present a new type of semi-naive Bayesian operation, Subsumption Resolution (SR), which efficiently identifies occurrences of the specialization-generalization relationship and eliminates generalizations at classification time. We extend SR to Near-Subsumption Resolution (NSR) to delete near-generalizations in addition to generalizations. We develop two versions of SR: one that performs SR during training, called eager SR (ESR), and another that performs SR during testing, called lazy SR (LSR). We investigate the effect of ESR, LSR, NSR and conventional attribute elimination (BSE) on NB and Averaged One-Dependence Estimators (AODE), a powerful alternative to NB. BSE imposes very high training time overheads on NB and AODE accompanied by varying decreases in classification time overheads. ESR, LSR and NSR impose high training time and test time overheads on NB. However, LSR imposes no extra training time overheads and only modest test time overheads on AODE, while ESR and NSR impose modest training and test time overheads on AODE. Our extensive experimental comparison on sixty UCI data sets shows that applying BSE, LSR or NSR to NB significantly improves both zero-one loss and RMSE, while applying BSE, ESR or NSR to AODE significantly improves zero-one loss and RMSE and applying LSR to AODE significantly improves zero-one loss. The Friedman test and Nemenyi test show that AODE with ESR or NSR have a significant zero-one loss and RMSE advantage over Logistic Regression and a zero-one loss advantage over Weka's LibSVM implementation with a grid parameter search on categorical data. AODE with LSR has a zero-one loss advantage over Logistic Regression and comparable zero-one loss with LibSVM. Finally, we examine the circumstances under which the elimination of near-generalizations proves beneficial.},
Address = {Netherlands},
Doi = {10.1007/s10994-011-5275-2},
ISSN = {0885-6125},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data},
Urltext = {Link to paper via SpringerLink}
}
ABSTRACT Semi-naive Bayesian techniques seek to improve the accuracy of naive Bayes (NB) by relaxing the attribute independence assumption. We present a new type of semi-naive Bayesian operation, Subsumption Resolution (SR), which efficiently identifies occurrences of the specialization-generalization relationship and eliminates generalizations at classification time. We extend SR to Near-Subsumption Resolution (NSR) to delete near-generalizations in addition to generalizations. We develop two versions of SR: one that performs SR during training, called eager SR (ESR), and another that performs SR during testing, called lazy SR (LSR). We investigate the effect of ESR, LSR, NSR and conventional attribute elimination (BSE) on NB and Averaged One-Dependence Estimators (AODE), a powerful alternative to NB. BSE imposes very high training time overheads on NB and AODE accompanied by varying decreases in classification time overheads. ESR, LSR and NSR impose high training time and test time overheads on NB. However, LSR imposes no extra training time overheads and only modest test time overheads on AODE, while ESR and NSR impose modest training and test time overheads on AODE. Our extensive experimental comparison on sixty UCI data sets shows that applying BSE, LSR or NSR to NB significantly improves both zero-one loss and RMSE, while applying BSE, ESR or NSR to AODE significantly improves zero-one loss and RMSE and applying LSR to AODE significantly improves zero-one loss. The Friedman test and Nemenyi test show that AODE with ESR or NSR have a significant zero-one loss and RMSE advantage over Logistic Regression and a zero-one loss advantage over Weka's LibSVM implementation with a grid parameter search on categorical data. AODE with LSR has a zero-one loss advantage over Logistic Regression and comparable zero-one loss with LibSVM. Finally, we examine the circumstances under which the elimination of near-generalizations proves beneficial.
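Editor's note: the specialization-generalization relationship that SR exploits can be detected from simple value and value-pair counts, as in the hedged sketch below. The data structures, the min_support threshold and the example counts are all assumptions, not the paper's implementation; the idea is only that if every training example containing value x_j also contains value x_i, then x_i is a generalization of x_j and can be ignored when both appear in the instance being classified.

    # Illustrative detection of generalizations from counts (assumed thresholds).
    from itertools import combinations

    def find_generalizations(instance, count, pair_count, min_support=30):
        redundant = set()
        for xi, xj in combinations(instance, 2):
            both = pair_count.get(frozenset((xi, xj)), 0)
            if count.get(xj, 0) >= min_support and both == count[xj]:
                redundant.add(xi)   # every example with xj also has xi: drop xi
            elif count.get(xi, 0) >= min_support and both == count[xi]:
                redundant.add(xj)   # every example with xi also has xj: drop xj
        return redundant

    # Classic example: sex=male implies pregnant=no, so pregnant=no is redundant.
    count = {("sex", "male"): 500, ("pregnant", "no"): 800}
    pair_count = {frozenset((("sex", "male"), ("pregnant", "no"))): 500}
    print(find_generalizations([("sex", "male"), ("pregnant", "no")], count, pair_count))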

Ng, N. M., Pierce, J. D., Webb, G. I., Ratnikov, B. I., Wijeyewickrema, L. C., Duncan, R. C., Robertson, A. L., Bottomley, S. P., Boyd, S. E., & Pike, R. N. (2011). Discovery of Amino Acid Motifs for Thrombin Cleavage and Validation Using a Model Substrate. Biochemistry, 50(48), 10499-10507.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{NgEtAl11,
Title = {Discovery of Amino Acid Motifs for Thrombin Cleavage and Validation Using a Model Substrate},
Author = {N.M. Ng and Pierce, J.D. and Webb, G.I. and Ratnikov, B.I. and Wijeyewickrema, L.C. and Duncan, R.C. and Robertson, A.L. and Bottomley, S.P. and Boyd, S.E. and Pike, R.N.},
Journal = {Biochemistry},
Year = {2011},
Number = {48},
Pages = {10499-10507},
Volume = {50},
Abstract = {Understanding the active site preferences of an enzyme is critical to the design of effective inhibitors and to gaining insights into its mechanisms of action on substrates. While the subsite specificity of thrombin is understood, it is not clear whether the enzyme prefers individual amino acids at each subsite in isolation or prefers to cleave combinations of amino acids as a motif. To investigate whether preferred peptide motifs for cleavage could be identified for thrombin, we exposed a phage-displayed peptide library to thrombin. The resulting preferentially cleaved substrates were analyzed using the technique of association rule discovery. The results revealed that thrombin selected for amino acid motifs in cleavage sites. The contribution of these hypothetical motifs to substrate cleavage efficiency was further investigated using the B1 IgG-binding domain of streptococcal protein G as a model substrate. Introduction of a P2-P1′ LRS thrombin cleavage sequence within a major loop of the protein led to cleavage of the protein by thrombin, with the cleavage efficiency increasing with the length of the loop. Introduction of further P3-P1 and P1-P1′-P3′ amino acid motifs into the loop region yielded greater cleavage efficiencies, suggesting that the susceptibility of a protein substrate to cleavage by thrombin is influenced by these motifs, perhaps because of cooperative effects between subsites closest to the scissile peptide bond.},
Doi = {10.1021/bi201333g},
Eprint = {http://pubs.acs.org/doi/pdf/10.1021/bi201333g},
Keywords = {Bioinformatics},
Related = {computational-biology},
Url = {http://pubs.acs.org/doi/abs/10.1021/bi201333g}
}
ABSTRACT Understanding the active site preferences of an enzyme is critical to the design of effective inhibitors and to gaining insights into its mechanisms of action on substrates. While the subsite specificity of thrombin is understood, it is not clear whether the enzyme prefers individual amino acids at each subsite in isolation or prefers to cleave combinations of amino acids as a motif. To investigate whether preferred peptide motifs for cleavage could be identified for thrombin, we exposed a phage-displayed peptide library to thrombin. The resulting preferentially cleaved substrates were analyzed using the technique of association rule discovery. The results revealed that thrombin selected for amino acid motifs in cleavage sites. The contribution of these hypothetical motifs to substrate cleavage efficiency was further investigated using the B1 IgG-binding domain of streptococcal protein G as a model substrate. Introduction of a P2-P1′ LRS thrombin cleavage sequence within a major loop of the protein led to cleavage of the protein by thrombin, with the cleavage efficiency increasing with the length of the loop. Introduction of further P3-P1 and P1-P1′-P3′ amino acid motifs into the loop region yielded greater cleavage efficiencies, suggesting that the susceptibility of a protein substrate to cleavage by thrombin is influenced by these motifs, perhaps because of cooperative effects between subsites closest to the scissile peptide bond.

Song, J., Tan, H., Boyd, S. E., Shen, H., Mahmood, K., Webb, G. I., Akutsu, T., Whisstock, J. C., & Pike, R. N. (2011). Bioinformatic Approaches for Predicting Substrates of Proteases. Journal of Bioinformatics and Computational Biology, 9(1), 149-178.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl11,
Title = {Bioinformatic Approaches for Predicting Substrates of Proteases},
Author = {J. Song and H. Tan and S.E. Boyd and H. Shen and K. Mahmood and G.I. Webb and T. Akutsu and J.C. Whisstock and R.N. Pike},
Journal = {Journal of Bioinformatics and Computational Biology},
Year = {2011},
Number = {1},
Pages = {149-178},
Volume = {9},
Abstract = {Proteases have central roles in "life and death" processes due to their important ability to catalytically hydrolyse protein substrates, usually altering the function and/or activity of the target in the process. Knowledge of the substrate specificity of a protease should, in theory, dramatically improve the ability to predict target protein substrates. However, experimental identification and characterization of protease substrates is often difficult and time-consuming. Thus solving the "substrate identification" problem is fundamental to both understanding protease biology and the development of therapeutics that target specific protease-regulated pathways. In this context, bioinformatic prediction of protease substrates may provide useful and experimentally testable information about novel potential cleavage sites in candidate substrates. In this article, we provide an overview of recent advances in developing bioinformatic approaches for predicting protease substrate cleavage sites and identifying novel putative substrates. We discuss the advantages and drawbacks of the current methods and detail how more accurate models can be built by deriving multiple sequence and structural features of substrates. We also provide some suggestions about how future studies might further improve the accuracy of protease substrate specificity prediction.},
Audit-trail = {http://www.worldscinet.com/jbcb/00/0001/S0219720011005288.html},
Doi = {10.1142/S0219720011005288},
Keywords = {Bioinformatics},
Publisher = {World Scientific},
Related = {computational-biology}
}
ABSTRACT Proteases have central roles in "life and death" processes due to their important ability to catalytically hydrolyse protein substrates, usually altering the function and/or activity of the target in the process. Knowledge of the substrate specificity of a protease should, in theory, dramatically improve the ability to predict target protein substrates. However, experimental identification and characterization of protease substrates is often difficult and time-consuming. Thus solving the "substrate identification" problem is fundamental to both understanding protease biology and the development of therapeutics that target specific protease-regulated pathways. In this context, bioinformatic prediction of protease substrates may provide useful and experimentally testable information about novel potential cleavage sites in candidate substrates. In this article, we provide an overview of recent advances in developing bioinformatic approaches for predicting protease substrate cleavage sites and identifying novel putative substrates. We discuss the advantages and drawbacks of the current methods and detail how more accurate models can be built by deriving multiple sequence and structural features of substrates. We also provide some suggestions about how future studies might further improve the accuracy of protease substrate specificity prediction.

Webb, G. I. (2011). Filtered-top-k Association Discovery. WIREs Data Mining and Knowledge Discovery, 1(3), 183-192.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb11,
Title = {Filtered-top-k Association Discovery},
Author = {G.I. Webb},
Journal = {WIREs Data Mining and Knowledge Discovery},
Year = {2011},
Number = {3},
Pages = {183-192},
Volume = {1},
Abstract = {Association mining has been one of the most intensively researched areas of data mining. However, direct uptake of the resulting technologies has been relatively low. This paper examines some of the reasons why the dominant paradigms in association mining have not lived up to their promise, and argues that a powerful alternative is provided by top-k techniques coupled with appropriate statistical and other filtering.},
Doi = {10.1002/widm.28},
Keywords = {Association Rule Discovery and statistically sound discovery},
Publisher = {Wiley},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT Association mining has been one of the most intensively researched areas of data mining. However, direct uptake of the resulting technologies has been relatively low. This paper examines some of the reasons why the dominant paradigms in association mining have not lived up to their promise, and argues that a powerful alternative is provided by top-k techniques coupled with appropriate statistical and other filtering.

Ting, K. M., Wells, J., Tan, S., Teng, S., & Webb, G. I. (2011). Feature-subspace aggregating: Ensembles for stable and unstable learners. Machine Learning, 82(3), 375-397.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{TingEtAl11,
Title = {Feature-subspace aggregating: Ensembles for stable and unstable learners},
Author = {K.M. Ting and J. Wells and S. Tan and S. Teng and G.I. Webb},
Journal = {Machine Learning},
Year = {2011},
Number = {3},
Pages = {375-397},
Volume = {82},
Abstract = {This paper introduces a new ensemble approach, Feature-Subspace Aggregating (Feating), which builds local models instead of global models. Feating is a generic ensemble approach that can enhance the predictive performance of both stable and unstable learners. In contrast, most existing ensemble approaches can improve the predictive performance of unstable learners only. Our analysis shows that the new approach reduces the execution time to generate a model in an ensemble through an increased level of localisation in Feating. Our empirical evaluation shows that Feating performs significantly better than Boosting, Random Subspace and Bagging in terms of predictive accuracy, when a stable learner SVM is used as the base learner. The speed up achieved by Feating makes feasible SVM ensembles that would otherwise be infeasible for large data sets. When SVM is the preferred base learner, we show that Feating SVM performs better than Boosting decision trees and Random Forests. We further demonstrate that Feating also substantially reduces the error of another stable learner, k-nearest neighbour, and an unstable learner, decision tree.},
Address = {Netherlands},
Doi = {10.1007/s10994-010-5224-5},
ISSN = {0885-6125},
Keywords = {Feating and Multiboosting and Boosting and Bias-variance},
Publisher = {Springer},
Related = {feating},
Urltext = {Link to paper via SpringerLink}
}
ABSTRACT This paper introduces a new ensemble approach, Feature-Subspace Aggregating (Feating), which builds local models instead of global models. Feating is a generic ensemble approach that can enhance the predictive performance of both stable and unstable learners. In contrast, most existing ensemble approaches can improve the predictive performance of unstable learners only. Our analysis shows that the new approach reduces the execution time to generate a model in an ensemble through an increased level of localisation in Feating. Our empirical evaluation shows that Feating performs significantly better than Boosting, Random Subspace and Bagging in terms of predictive accuracy, when a stable learner SVM is used as the base learner. The speed up achieved by Feating makes feasible SVM ensembles that would otherwise be infeasible for large data sets. When SVM is the preferred base learner, we show that Feating SVM performs better than Boosting decision trees and Random Forests. We further demonstrate that Feating also substantially reduces the error of another stable learner, k-nearest neighbour, and an unstable learner, decision tree.
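Editor's note: a bare-bones illustration of building local models on feature subspaces is given below. It partitions the data by the values of each small subset of discrete attributes, fits one SVM per partition, and votes across subsets; the split into categorical partitioning attributes and numeric model attributes, the toy data and the voting rule are simplifying assumptions, not the paper's tree-based implementation.

    # Bare-bones sketch of Feating-style local models (illustrative assumptions:
    # categorical attributes define the partitions, numeric attributes feed the
    # local SVMs, and prediction is by majority vote across subsets).
    from itertools import combinations
    from collections import defaultdict
    import numpy as np
    from sklearn.svm import SVC

    def train_feating(X_cat, X_num, y, level=1):
        models = {}
        for subset in combinations(range(X_cat.shape[1]), level):
            groups = defaultdict(list)
            for idx, row in enumerate(X_cat):
                groups[tuple(row[list(subset)])].append(idx)
            for key, idxs in groups.items():
                if len(set(y[idxs])) > 1:          # need both classes to fit a model
                    models[(subset, key)] = SVC().fit(X_num[idxs], y[idxs])
        return models

    def predict_feating(models, x_cat, x_num):
        votes = [m.predict([x_num])[0] for (subset, key), m in models.items()
                 if tuple(x_cat[list(subset)]) == key]
        return max(set(votes), key=votes.count) if votes else None

    rng = np.random.default_rng(1)
    X_num = rng.normal(size=(120, 4))
    X_cat = rng.integers(0, 2, size=(120, 3))
    y = (X_num[:, 0] + X_cat[:, 0] > 0.5).astype(int)
    models = train_feating(X_cat, X_num, y, level=1)
    print(predict_feating(models, X_cat[0], X_num[0]), "true:", y[0])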

Webb, G. I., Liu, B., Zhang, C., Gunopulos, D., & Wu, X. (Eds.). (2010). ICDM 2010, The 10th IEEE International Conference on Data Mining. IEEE Computer Society.
[URL] [Bibtex]

@Proceedings{WebbICDM2010,
Title = {ICDM 2010, The 10th {IEEE} International Conference on Data Mining},
Year = {2010},
Editor = {Webb, G.I. and Liu, B. and Zhang, C. and Gunopulos, D. and Wu, X.},
Publisher = {IEEE Computer Society},
Url = {https://www.computer.org/csdl/proceedings/icdm/2010/4256/00/index.html},
Urltext = {Link to proceedings}
}
ABSTRACT 

Song, J., Tan, H., Shen, H., Mahmood, K., Boyd, S. E., Webb, G. I., Akutsu, T., & Whisstock, J. C. (2010). Cascleave: Towards More Accurate Prediction of Caspase Substrate Cleavage Sites. Bioinformatics, 26(6), 752-760.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl10,
Title = {Cascleave: Towards More Accurate Prediction of Caspase Substrate Cleavage Sites},
Author = {J. Song and H. Tan and H. Shen and K. Mahmood and S.E. Boyd and G.I. Webb and T. Akutsu and J.C. Whisstock},
Journal = {Bioinformatics},
Year = {2010},
Number = {6},
Pages = {752-760},
Volume = {26},
Abstract = {Motivation: The caspase family of cysteine proteases plays essential roles in key biological processes such as programmed cell death, differentiation, proliferation, necrosis and inflammation. The complete repertoire of caspase substrates remains to be fully characterized. Accordingly, systematic computational screening studies of caspase substrate cleavage sites may provide insight into the substrate specificity of caspases and further facilitate the discovery of putative novel substrates. Results: In this article we develop an approach (termed Cascleave) to predict both classical (i.e. following a P1 Asp) and non-typical caspase cleavage sites. When using local sequence-derived profiles, Cascleave successfully predicted 82.2% of the known substrate cleavage sites, with a Matthews correlation coefficient (MCC) of 0.667. We found that prediction performance could be further improved by incorporating information such as predicted solvent accessibility and whether a cleavage sequence lies in a region that is most likely natively unstructured. Novel bi-profile Bayesian signatures were found to significantly improve the prediction performance and yielded the best performance with an overall accuracy of 87.6% and an MCC of 0.747, which is higher than that of published methods that essentially rely on amino acid sequence alone. It is anticipated that Cascleave will be a powerful tool for predicting novel substrate cleavage sites of caspases and shedding new insights on the unknown caspase-substrate interactivity relationship.},
Audit-trail = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/btq339v1},
Doi = {10.1093/bioinformatics/btq043},
Keywords = {Bioinformatics},
Publisher = {Oxford Univ Press},
Related = {computational-biology}
}
ABSTRACT Motivation: The caspase family of cysteine proteases plays essential roles in key biological processes such as programmed cell death, differentiation, proliferation, necrosis and inflammation. The complete repertoire of caspase substrates remains to be fully characterized. Accordingly, systematic computational screening studies of caspase substrate cleavage sites may provide insight into the substrate specificity of caspases and further facilitate the discovery of putative novel substrates. Results: In this article we develop an approach (termed Cascleave) to predict both classical (i.e. following a P1 Asp) and non-typical caspase cleavage sites. When using local sequence-derived profiles, Cascleave successfully predicted 82.2% of the known substrate cleavage sites, with a Matthews correlation coefficient (MCC) of 0.667. We found that prediction performance could be further improved by incorporating information such as predicted solvent accessibility and whether a cleavage sequence lies in a region that is most likely natively unstructured. Novel bi-profile Bayesian signatures were found to significantly improve the prediction performance and yielded the best performance with an overall accuracy of 87.6% and an MCC of 0.747, which is higher than that of published methods that essentially rely on amino acid sequence alone. It is anticipated that Cascleave will be a powerful tool for predicting novel substrate cleavage sites of caspases and shedding new insights on the unknown caspase-substrate interactivity relationship.

Mahmood, K., Konagurthu, A. S., Song, J., Buckle, A. M., Webb, G. I., & Whisstock, J. C. (2010). EGM: Encapsulated Gene-by-Gene Matching to Identify Gene Orthologs and Homologous Segments in Genomes. Bioinformatics, 26(17), 2076-2084.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{MahmoodEtAl10,
Title = {EGM: Encapsulated Gene-by-Gene Matching to Identify Gene Orthologs and Homologous Segments in Genomes},
Author = {K. Mahmood and A.S. Konagurthu and J. Song and A.M. Buckle and G.I. Webb and J.C. Whisstock},
Journal = {Bioinformatics},
Year = {2010},
Number = {17},
Pages = {2076-2084},
Volume = {26},
Abstract = {Motivation: Identification of functionally equivalent genes in different species is essential to understand the evolution of biological pathways and processes. At the same time, identification of strings of conserved orthologous genes helps identify complex genomic rearrangements across different organisms. Such an insight is particularly useful, for example, in the transfer of experimental results between different experimental systems such as Drosophila and mammals.
Results: Here we describe the Encapsulated Gene-by-gene Matching (EGM) approach, a method that employs a graph matching strategy to identify gene orthologs and conserved gene segments. Given a pair of genomes, EGM constructs a global gene match for all genes taking into account gene context and family information. The Hungarian method for identifying the maximum weight matching in bipartite graphs is employed, where the resulting matching reveals one-to-one correspondences between nodes (genes) in a manner that maximizes the gene similarity and context.
Conclusion: We tested our approach by performing several comparisons including a detailed Human v Mouse genome mapping. We find that the algorithm is robust and sensitive in detecting orthologs and conserved gene segments. EGM can sensitively detect rearrangements within large and small chromosomal segments. The EGM tool is fully automated and easy to use compared to other more complex methods that also require extensive manual intervention and input.
},
Audit-trail = {http://bioinformatics.oxfordjournals.org/cgi/content/abstract/26/6/752},
Doi = {10.1093/bioinformatics/btq339},
Keywords = {Bioinformatics},
Publisher = {Oxford Univ Press},
Related = {computational-biology}
}
ABSTRACT Motivation: Identification of functionally equivalent genes in different species is essential to understand the evolution of biological pathways and processes. At the same time, identification of strings of conserved orthologous genes helps identify complex genomic rearrangements across different organisms. Such an insight is particularly useful, for example, in the transfer of experimental results between different experimental systems such as Drosophila and mammals. Results: Here we describe the Encapsulated Gene-by-gene Matching (EGM) approach, a method that employs a graph matching strategy to identify gene orthologs and conserved gene segments. Given a pair of genomes, EGM constructs a global gene match for all genes taking into account gene context and family information. The Hungarian method for identifying the maximum weight matching in bipartite graphs is employed, where the resulting matching reveals one-to-one correspondences between nodes (genes) in a manner that maximizes the gene similarity and context. Conclusion: We tested our approach by performing several comparisons including a detailed Human v Mouse genome mapping. We find that the algorithm is robust and sensitive in detecting orthologs and conserved gene segments. EGM can sensitively detect rearrangements within large and small chromosomal segments. The EGM tool is fully automated and easy to use compared to other more complex methods that also require extensive manual intervention and input.
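Editor's note: the maximum-weight bipartite matching step at the heart of EGM can be reproduced with SciPy's Hungarian-method solver. The toy similarity matrix below is an assumption, and EGM's real weights also incorporate gene context and family information.

    # One-to-one gene matching via the Hungarian method (toy similarity scores).
    import numpy as np
    from scipy.optimize import linear_sum_assignment

    genes_a = ["a1", "a2", "a3"]
    genes_b = ["b1", "b2", "b3"]
    similarity = np.array([[0.9, 0.1, 0.2],
                           [0.2, 0.8, 0.3],
                           [0.1, 0.4, 0.7]])

    rows, cols = linear_sum_assignment(-similarity)  # negate: the solver minimises cost
    for i, j in zip(rows, cols):
        print(genes_a[i], "<->", genes_b[j], similarity[i, j])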

Webb, G. I. (2010). Self-Sufficient Itemsets: An Approach to Screening Potentially Interesting Associations Between Items. ACM Transactions on Knowledge Discovery from Data, 4, Art. no. 3.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb10,
Title = {Self-Sufficient Itemsets: An Approach to Screening Potentially Interesting Associations Between Items},
Author = {G.I. Webb},
Journal = {{ACM} Transactions on Knowledge Discovery from Data},
Year = {2010},
Volume = {4},
Abstract = {Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or their supersets. We argue that itemsets that are not self-sufficient will often be of little interest to the data analyst, as their frequency should be expected once that of the itemsets on which their frequency depends is known. We present statistical tests for statistically sound discovery of self-sufficient itemsets, and computational techniques that allow those tests to be applied as a post-processing step for any itemset discovery algorithm. We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.},
Articlenumber = {3},
Issue = {1},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Publisher = {ACM},
Related = {filtered-top-k-association-discovery},
Url = {http://dl.acm.org/authorize?270473}
}
ABSTRACT Self-sufficient itemsets are those whose frequency cannot be explained solely by the frequency of either their subsets or their supersets. We argue that itemsets that are not self-sufficient will often be of little interest to the data analyst, as their frequency should be expected once that of the itemsets on which their frequency depends is known. We present statistical tests for statistically sound discovery of self-sufficient itemsets, and computational techniques that allow those tests to be applied as a post-processing step for any itemset discovery algorithm. We also present a measure for assessing the degree of potential interest in an itemset that complements these statistical measures.
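Editor's note: the statistical flavour of this screening can be shown with a single Fisher exact test asking whether a pair of items co-occurs more often than their individual frequencies would predict. The counts below are hypothetical, and the paper's self-sufficiency tests additionally consider supersets and every binary partition of larger itemsets.

    # One-sided Fisher exact test of whether items X and Y co-occur more often
    # than expected under independence (hypothetical counts).
    from scipy.stats import fisher_exact

    n, n_x, n_y, n_xy = 10000, 600, 500, 120
    table = [[n_xy, n_x - n_xy],
             [n_y - n_xy, n - n_x - n_y + n_xy]]
    odds, p = fisher_exact(table, alternative="greater")
    print(f"p = {p:.3g}")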

Sammut, C., & Webb, G. I. (Eds.). (2010). Encyclopedia of Machine Learning. Berlin: Springer.
[URL] [Bibtex]

@Book{SammutWebb10,
Title = {Encyclopedia of Machine Learning},
Editor = {Sammut, C. and Webb, G.I.},
Publisher = {Springer},
Year = {2010},
Address = {Berlin},
Url = {http://www.springer.com/us/book/9780387307688}
}
ABSTRACT 

Yang, Y., & Webb, G. I. (2009). Discretization for Naive-Bayes Learning: Managing Discretization Bias and Variance. Machine Learning, 74(1), 39-74.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{YangWebb09,
Title = {Discretization for Naive-Bayes Learning: Managing Discretization Bias and Variance},
Author = {Y. Yang and G.I. Webb},
Journal = {Machine Learning},
Year = {2009},
Number = {1},
Pages = {39-74},
Volume = {74},
Abstract = {Quantitative attributes are usually discretized in Naive-Bayes learning. We establish simple conditions under which discretization is equivalent to use of the true probability density function during naive-Bayes learning. The use of different discretization techniques can be expected to affect the classification bias and variance of generated naive-Bayes classifiers, effects we name discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error. In particular, we supply insights into managing discretization bias and variance by adjusting the number of intervals and the number of training instances contained in each interval. We accordingly propose proportional discretization and fixed frequency discretization, two efficient unsupervised discretization methods that are able to effectively manage discretization bias and variance. We evaluate our new techniques against four key discretization methods for naive-Bayes classifiers. The experimental results support our theoretical analyses by showing that with statistically significant frequency, naive-Bayes classifiers trained on data discretized by our new methods are able to achieve lower classification error than those trained on data discretized by current established discretization methods.},
Address = {Netherlands},
Audit-trail = {DOI 10.1007/s10994-008-5083-5},
Doi = {10.1007/s10994-008-5083-5},
Keywords = {Discretization for Naive Bayes and Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {discretization-for-naive-bayes}
}
ABSTRACT Quantitative attributes are usually discretized in Naive-Bayes learning. We establish simple conditions under which discretization is equivalent to use of the true probability density function during naive-Bayes learning. The use of different discretization techniques can be expected to affect the classification bias and variance of generated naive-Bayes classifiers, effects we name discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error. In particular, we supply insights into managing discretization bias and variance by adjusting the number of intervals and the number of training instances contained in each interval. We accordingly propose proportional discretization and fixed frequency discretization, two efficient unsupervised discretization methods that are able to effectively manage discretization bias and variance. We evaluate our new techniques against four key discretization methods for naive-Bayes classifiers. The experimental results support our theoretical analyses by showing that with statistically significant frequency, naive-Bayes classifiers trained on data discretized by our new methods are able to achieve lower classification error than those trained on data discretized by current established discretization methods.
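Editor's note: the two discretization schemes proposed in this paper are simple to state. Fixed frequency discretization places roughly a fixed number m of training values in each interval, while proportional discretization lets both the number of intervals and the number of values per interval grow with the square root of the training set size. The sketch below is a hedged illustration under naive tie handling, not the authors' code, and the default m=30 is an assumption.

    # Illustrative cut-point computation for the two schemes (naive tie handling).
    import numpy as np

    def fixed_frequency_cut_points(values, m=30):
        v = np.sort(values)
        return [v[i] for i in range(m, len(v), m)]

    def proportional_cut_points(values):
        v = np.sort(values)
        per_interval = max(1, int(np.sqrt(len(v))))
        return [v[i] for i in range(per_interval, len(v), per_interval)]

    x = np.random.default_rng(2).normal(size=1000)
    print(len(fixed_frequency_cut_points(x)) + 1, "intervals (fixed frequency, m=30)")
    print(len(proportional_cut_points(x)) + 1, "intervals (proportional)")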

Song, J., Tan, H., Mahmood, K., Law, R. H. P., Buckle, A. M., Webb, G. I., Akutsu, T., & Whisstock, J. C. (2009). Prodepth: Predict Residue Depth by Support Vector Regression Approach from Protein Sequences Only. PLoS ONE, 4(9), e7072.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SongEtAl09,
Title = {Prodepth: Predict Residue Depth by Support Vector Regression Approach from Protein Sequences Only},
Author = {J. Song and H. Tan and K. Mahmood and R.H.P. Law and A.M. Buckle and G.I. Webb and T. Akutsu and J.C. Whisstock},
Journal = {PLoS ONE},
Year = {2009},
Number = {9},
Pages = {e7072},
Volume = {4},
Abstract = {Residue depth (RD) is a solvent exposure measure that complements the information provided by conventional accessible surface area (ASA) and describes to what extent a residue is buried in the protein structure space. Previous studies have established that RD is correlated with several protein properties, such as protein stability, residue conservation and amino acid types. Accurate prediction of RD has many potentially important applications in the field of structural bioinformatics, for example, facilitating the identification of functionally important residues, or residues in the folding nucleus, or enzyme active sites from sequence information. In this work, we introduce an efficient approach that uses support vector regression to quantify the relationship between RD and protein sequence. We systematically investigated eight different sequence encoding schemes including both local and global sequence characteristics and examined their respective prediction performances. For the objective evaluation of our approach, we used 5-fold cross-validation to assess the prediction accuracies and showed that the overall best performance could be achieved with a correlation coefficient (CC) of 0.71 between the observed and predicted RD values and a root mean square error (RMSE) of 1.74, after incorporating the relevant multiple sequence features. The results suggest that residue depth could be reliably predicted solely from protein primary sequences: local sequence environments are the major determinants, while global sequence features could influence the prediction performance marginally. We highlight two examples as a comparison in order to illustrate the applicability of this approach. We also discuss the potential implications of this new structural parameter in the field of protein structure prediction and homology modeling. This method might prove to be a powerful tool for sequence analysis.},
Audit-trail = {http://www.plosone.org/article/info:doi/10.1371/journal.pone.0007072},
Doi = {10.1371/journal.pone.0007072},
Keywords = {Bioinformatics},
Publisher = {PLOS},
Related = {computational-biology}
}
ABSTRACT Residue depth (RD) is a solvent exposure measure that complements the information provided by conventional accessible surface area (ASA) and describes to what extent a residue is buried in the protein structure space. Previous studies have established that RD is correlated with several protein properties, such as protein stability, residue conservation and amino acid types. Accurate prediction of RD has many potentially important applications in the field of structural bioinformatics, for example, facilitating the identification of functionally important residues, or residues in the folding nucleus, or enzyme active sites from sequence information. In this work, we introduce an efficient approach that uses support vector regression to quantify the relationship between RD and protein sequence. We systematically investigated eight different sequence encoding schemes including both local and global sequence characteristics and examined their respective prediction performances. For the objective evaluation of our approach, we used 5-fold cross-validation to assess the prediction accuracies and showed that the overall best performance could be achieved with a correlation coefficient (CC) of 0.71 between the observed and predicted RD values and a root mean square error (RMSE) of 1.74, after incorporating the relevant multiple sequence features. The results suggest that residue depth could be reliably predicted solely from protein primary sequences: local sequence environments are the major determinants, while global sequence features could influence the prediction performance marginally. We highlight two examples as a comparison in order to illustrate the applicability of this approach. We also discuss the potential implications of this new structural parameter in the field of protein structure prediction and homology modeling. This method might prove to be a powerful tool for sequence analysis.

Liu, B., Yang, Y., Webb, G. I., & Boughton, J. (2009). A Comparative Study of Bandwidth Choice in Kernel Density Estimation for Naive Bayesian Classification. Proceedings of the 13th Pacific-Asia Conference, PAKDD 2009, Berlin/Heidelberg, pp. 302-313.
[PDF] [URL] [Bibtex]  → Related papers and software

@InProceedings{LiuYangWebbBoughton09,
Title = {A Comparative Study of Bandwidth Choice in Kernel Density Estimation for Naive Bayesian Classification},
Author = {B. Liu and Y. Yang and G.I. Webb and J. Boughton},
Booktitle = {Proceedings of the 13th {Pacific}-{Asia} Conference, PAKDD 2009},
Year = {2009},
Address = {Berlin/Heidelberg},
Pages = {302-313},
Publisher = {Springer},
Keywords = {Conditional Probability Estimation and AODE and Discretization for Naive Bayes},
Location = {Bangkok, Thailand},
Related = {discretization-for-naive-bayes},
Url = {http://link.springer.com/chapter/10.1007%2F978-3-642-01307-2_29}
}
ABSTRACT 

Novak, P., Lavrac, N., & Webb, G. I. (2009). Supervised Descriptive Rule Discovery: A Unifying Survey of Contrast Set, Emerging Pattern and Subgroup Mining. Journal of Machine Learning Research, 10, 377-403.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{NovakLavracWebb09,
Title = {Supervised Descriptive Rule Discovery: A Unifying Survey of Contrast Set, Emerging Pattern and Subgroup Mining},
Author = {P. Novak and N. Lavrac and G.I. Webb},
Journal = {Journal of Machine Learning Research},
Year = {2009},
Pages = {377-403},
Volume = {10},
Abstract = {This paper gives a survey of contrast set mining (CSM), emerging pattern mining (EPM), and subgroup discovery (SD) in a unifying framework named supervised descriptive rule discovery. While all these research areas aim at discovering patterns in the form of rules induced from labeled data, they use different terminology and task definitions, claim to have different goals, claim to use different rule learning heuristics, and use different means for selecting subsets of induced patterns. This paper contributes a novel understanding of these subareas of data mining by presenting a unified terminology, by explaining the apparent differences between the learning tasks as variants of a unique supervised descriptive rule discovery task and by exploring the apparent differences between the approaches. It also shows that various rule learning heuristics used in CSM, EPM and SD algorithms all aim at optimizing a trade off between rule coverage and precision. The commonalities (and differences) between the approaches are showcased on a selection of best known variants of CSM, EPM and SD algorithms. The paper also provides a critical survey of existing supervised descriptive rule discovery visualization methods.},
Keywords = {Association Rule Discovery and OPUS},
Related = {filtered-top-k-association-discovery},
Url = {http://www.jmlr.org/papers/volume10/kralj-novak09a/kralj-novak09a.pdf}
}
ABSTRACT This paper gives a survey of contrast set mining (CSM), emerging pattern mining (EPM), and subgroup discovery (SD) in a unifying framework named supervised descriptive rule discovery. While all these research areas aim at discovering patterns in the form of rules induced from labeled data, they use different terminology and task definitions, claim to have different goals, claim to use different rule learning heuristics, and use different means for selecting subsets of induced patterns. This paper contributes a novel understanding of these subareas of data mining by presenting a unified terminology, by explaining the apparent differences between the learning tasks as variants of a unique supervised descriptive rule discovery task and by exploring the apparent differences between the approaches. It also shows that various rule learning heuristics used in CSM, EPM and SD algorithms all aim at optimizing a trade off between rule coverage and precision. The commonalities (and differences) between the approaches are showcased on a selection of best known variants of CSM, EPM and SD algorithms. The paper also provides a critical survey of existing supervised descriptive rule discovery visualization methods.
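Editor's note: as a concrete instance of the coverage/precision trade-off the survey refers to, the weighted relative accuracy heuristic widely used in subgroup discovery can be written as

    \mathrm{WRAcc}(\text{Class} \leftarrow \text{Cond}) =
      P(\text{Cond}) \left( P(\text{Class} \mid \text{Cond}) - P(\text{Class}) \right)

where the first factor rewards rule coverage and the second rewards precision above the default class probability.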

Ting, K. M., Wells, J. R., Tan, S. C., Teng, S. W., & Webb, G. I. (2009). FaSS: Ensembles for Stable Learners. Proceedings of the 8th International Workshop on Multiple Classifier Systems, MCS 2009, Berlin, pp. 364-374.
[DOI] [Bibtex]  → Related papers and software

@InProceedings{TingEtAl09,
Title = {FaSS: Ensembles for Stable Learners},
Author = {K.M. Ting and J.R. Wells and S.C. Tan and S.W. Teng and G.I. Webb},
Booktitle = {Proceedings of the 8th International Workshop on Multiple Classifier Systems, MCS 2009},
Year = {2009},
Address = {Berlin},
Pages = {364-374},
Publisher = {Springer},
Doi = {10.1007/978-3-642-02326-2_37},
Keywords = {Feating and Multiboosting and Boosting},
Location = {Reykjavik, Iceland},
Related = {feating}
}
ABSTRACT 

Hui, B., Yang, Y., & Webb, G. I. (2009). Anytime Classification for a Pool of Instances. Machine Learning, 77(1), 61-102.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{HuiYangWebb09,
Title = {Anytime Classification for a Pool of Instances},
Author = {B. Hui and Y. Yang and G.I. Webb},
Journal = {Machine Learning},
Year = {2009},
Number = {1},
Pages = {61-102},
Volume = {77},
Abstract = {In many real-world applications of classification learning, such as
credit card transaction vetting or classification embedded in sensor
nodes, multiple instances simultaneously require classification under
computational resource constraints such as limited time or limited
battery capacity. In such a situation, available computational
resources should be allocated across the instances in order to
optimize the overall classification efficacy and efficiency. We
propose a novel anytime classification framework, Scheduling Anytime
Averaged Probabilistic Estimators (SAAPE), which is capable of
classifying a pool of instances, delivering accurate results whenever
interrupted and optimizing the collective classification
performance. Following the practice of our previous anytime
classification system AAPE, SAAPE runs a sequence of very efficient
Bayesian probabilistic classifiers to classify each single
instance. Furthermore, SAAPE implements seven alternative scheduling
schemes to decide which instance gets available computational
resources next such that a new classifier can be applied to refine its
classification. We formally present each scheduling scheme's
definition, rationale and time complexity. We conduct large-scale
experiments using 60 benchmark data sets and diversified statistical
tests to evaluate SAAPE's performance on zero-one loss classification
as well as on probability estimation. We analyze each scheduling
scheme's advantage and disadvantage according to both theoretical
understandings and empirical observations. Consequently we identify
effective scheduling schemes that enable SAAPE to accomplish accurate
anytime classification for a pool of instances.},
Address = {Netherlands},
Audit-trail = {http://dx.doi.org/10.1007/s10994-009-5118-6},
Doi = {10.1007/s10994-009-5118-6},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT In many real-world applications of classification learning, such as credit card transaction vetting or classification embedded in sensor nodes, multiple instances simultaneously require classification under computational resource constraints such as limited time or limited battery capacity. In such a situation, available computational resources should be allocated across the instances in order to optimize the overall classification efficacy and efficiency. We propose a novel anytime classification framework, Scheduling Anytime Averaged Probabilistic Estimators (SAAPE), which is capable of classifying a pool of instances, delivering accurate results whenever interrupted and optimizing the collective classification performance. Following the practice of our previous anytime classification system AAPE, SAAPE runs a sequence of very efficient Bayesian probabilistic classifiers to classify each single instance. Furthermore, SAAPE implements seven alternative scheduling schemes to decide which instance gets available computational resources next such that a new classifier can be applied to refine its classification. We formally present each scheduling scheme's definition, rationale and time complexity. We conduct large-scale experiments using 60 benchmark data sets and diversified statistical tests to evaluate SAAPE's performance on zero-one loss classification as well as on probability estimation. We analyze each scheduling scheme's advantage and disadvantage according to both theoretical understandings and empirical observations. Consequently we identify effective scheduling schemes that enable SAAPE to accomplish accurate anytime classification for a pool of instances.
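
SAAPE's seven scheduling schemes are not reproduced here, but the overall anytime pattern the abstract describes can be sketched as follows: every instance in the pool holds an ordered list of increasingly expensive probabilistic classifiers, a scheduler picks which instance is refined next, and the current predictions remain valid whenever the time budget runs out. The toy estimators and the "least refined first" schedule below are assumptions made purely for illustration.

import time

def anytime_classify(instances, classifier_steps, budget_seconds, schedule):
    """Anytime classification of a pool of instances (illustrative sketch).

    instances        -- the pool of instances to classify
    classifier_steps -- classifiers ordered from cheapest to most expensive;
                        each maps an instance to a dict of class probabilities
    budget_seconds   -- wall-clock budget; current predictions are returned
                        whenever it is exhausted
    schedule         -- schedule(pending, steps_done) -> index of the instance
                        that receives the next refinement
    """
    # Classify everything with the cheapest estimator first, so an answer
    # exists no matter how early the process is interrupted.
    predictions = [classifier_steps[0](x) for x in instances]
    steps_done = [1] * len(instances)
    deadline = time.monotonic() + budget_seconds
    while time.monotonic() < deadline:
        pending = [i for i, s in enumerate(steps_done) if s < len(classifier_steps)]
        if not pending:
            break
        i = schedule(pending, steps_done)
        predictions[i] = classifier_steps[steps_done[i]](instances[i])
        steps_done[i] += 1
    return predictions

if __name__ == "__main__":
    # Toy estimators of increasing "quality"; a real system would apply
    # progressively more expensive Bayesian classifiers.
    def make_step(k):
        def step(x):
            p = min(1.0, max(0.0, 0.5 + 0.1 * k * (x - 0.5)))
            return {"pos": round(p, 3), "neg": round(1 - p, 3)}
        return step
    steps = [make_step(k) for k in range(1, 4)]
    pool = [0.2, 0.9, 0.5]
    least_refined = lambda pending, done: min(pending, key=lambda i: done[i])
    print(anytime_classify(pool, steps, budget_seconds=0.01, schedule=least_refined))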

Webb, G. I. (2008). Multi-Strategy Ensemble Learning, Ensembles of Bayesian Classifiers, and the Problem of False Discoveries. Proceedings of the Seventh Australasian Data Mining Conference (AusDM 2008), pp. 15.
[Bibtex]

@InProceedings{Webb08b,
Title = {Multi-Strategy Ensemble Learning, Ensembles of Bayesian Classifiers, and the Problem of False Discoveries},
Author = {G.I. Webb},
Booktitle = {Proceedings of the Seventh Australasian Data Mining Conference (AusDM 2008)},
Year = {2008},
Pages = {15},
Publisher = {Australian Computer Society},
Location = {Adelaide, Australia},
Notes = {Abstract}
}
ABSTRACT 

Webb, G. I. (2008). Layered Critical Values: A Powerful Direct-Adjustment Approach to Discovering Significant Patterns. Machine Learning, 71(2-3), 307-323 [Technical Note].
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb08,
Title = {Layered Critical Values: A Powerful Direct-Adjustment Approach to Discovering Significant Patterns},
Author = {G.I. Webb},
Journal = {Machine Learning},
Year = {2008},
Number = {2-3},
Pages = {307-323 [Technical Note]},
Volume = {71},
Abstract = {Standard pattern discovery techniques, such as association rules, suffer an extreme risk of finding very large numbers of spurious patterns for many knowledge discovery tasks. The direct-adjustment approach to controlling this risk applies a statistical test during the discovery process, using a critical value adjusted to take account of the size of the search space. However, a problem with the direct-adjustment strategy is that it may discard numerous true patterns. This paper investigates the assignment of different critical values to different areas of the search space as an approach to alleviating this problem, using a variant of a technique originally developed for other purposes. This approach is shown to be effective at increasing the number of discoveries while still maintaining strict control over the risk of false discoveries.},
Address = {Netherlands},
Audit-trail = {DOI 10.1007/s10994-008-5046-x},
Doi = {10.1007/s10994-008-5046-x},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Notes = {Technical Note},
Publisher = {Springer},
Related = {statistically-sound-association-discovery}
}
ABSTRACT Standard pattern discovery techniques, such as association rules, suffer an extreme risk of finding very large numbers of spurious patterns for many knowledge discovery tasks. The direct-adjustment approach to controlling this risk applies a statistical test during the discovery process, using a critical value adjusted to take account of the size of the search space. However, a problem with the direct-adjustment strategy is that it may discard numerous true patterns. This paper investigates the assignment of different critical values to different areas of the search space as an approach to alleviating this problem, using a variant of a technique originally developed for other purposes. This approach is shown to be effective at increasing the number of discoveries while still maintaining strict control over the risk of false discoveries.
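
The direct-adjustment idea behind layered critical values can be sketched in a few lines: partition the candidate patterns by size, give each layer a share of the familywise significance budget, and Bonferroni-adjust within the layer. The equal split used below is only an illustrative allocation; the paper studies assignments that are more powerful than this.

from math import comb

def layered_critical_values(alpha, n_items, max_size):
    """Per-pattern critical values for each layer of a pattern search space.

    Layer k holds the comb(n_items, k) candidate patterns of size k.  The
    familywise budget alpha is divided equally between the layers and then
    shared among the candidates within each layer.
    """
    per_layer = alpha / max_size
    return {k: per_layer / comb(n_items, k) for k in range(1, max_size + 1)}

if __name__ == "__main__":
    # Critical values for patterns of up to 4 items drawn from 20 items.
    for size, cv in layered_critical_values(0.05, n_items=20, max_size=4).items():
        print(f"size {size}: critical value {cv:.2e}")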

Yang, Y., Webb, G. I., Cerquides, J., Korb, K., Boughton, J., & Ting, K-M. (2007). To Select or To Weigh: A Comparative Study of Linear Combination Schemes for SuperParent-One-Dependence Estimators. IEEE Transactions on Knowledge and Data Engineering (TKDE), 19(12), 1652-1665.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{YangWebbCerquideszKorbBoughtonTing07,
Title = {To Select or To Weigh: A Comparative Study of Linear Combination Schemes for SuperParent-One-Dependence Estimators},
Author = {Y. Yang and G.I. Webb and J. Cerquides and K. Korb and J. Boughton and K-M. Ting},
Journal = {{IEEE} Transactions on Knowledge and Data Engineering (TKDE)},
Year = {2007},
Number = {12},
Pages = {1652-1665},
Volume = {19},
Abstract = {We conduct a large-scale comparative study on linearly combining superparent-one-dependence estimators (SPODEs), a popular family of semi-naive Bayesian classifiers. Altogether 16 model selection and weighing schemes, 58 benchmark data sets, as well as various statistical tests are employed. This paper’s main contributions are three-fold. First, it formally presents each scheme’s definition, rationale and time complexity; and hence can serve as a comprehensive reference for researchers interested in ensemble learning. Second, it offers bias-variance analysis for each scheme’s classification error performance. Third, it identifies effective schemes that meet various needs in practice. This leads to accurate and fast classification algorithms with immediate and significant impact on real-world applications. Another important feature of our study is using a variety of statistical tests to evaluate multiple learning methods across multiple data sets.},
Address = {Los Alamitos, CA},
Doi = {10.1109/TKDE.2007.190650},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {{IEEE} Computer Society},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT We conduct a large-scale comparative study on linearly combining superparent-one-dependence estimators (SPODEs), a popular family of semi-naive Bayesian classifiers. Altogether 16 model selection and weighing schemes, 58 benchmark data sets, as well as various statistical tests are employed. This paper’s main contributions are three-fold. First, it formally presents each scheme’s definition, rationale and time complexity; and hence can serve as a comprehensive reference for researchers interested in ensemble learning. Second, it offers bias-variance analysis for each scheme’s classification error performance. Third, it identifies effective schemes that meet various needs in practice. This leads to accurate and fast classification algorithms with immediate and significant impact on real-world applications. Another important feature of our study is using a variety of statistical tests to evaluate multiple learning methods across multiple data sets.

Webb, G. I. (2007). Tenth Anniversary Edition Editorial. Data Mining and Knowledge Discovery, 15(1), 1-2.
[DOI] [Bibtex]

@Article{Webb07b,
Title = {Tenth Anniversary Edition Editorial},
Author = {G. I. Webb},
Journal = {Data Mining and Knowledge Discovery},
Year = {2007},
Number = {1},
Pages = {1-2},
Volume = {15},
Address = {Netherlands},
Doi = {10.1007/s10618-007-0075-9},
Publisher = {Springer}
}
ABSTRACT 

Yang, Y., Webb, G. I., Korb, K., & Ting, K-M. (2007). Classifying under Computational Resource Constraints: Anytime Classification Using Probabilistic Estimators. Machine Learning, 69(1), 35-53.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{YangWebbKorbTing07,
Title = {Classifying under Computational Resource Constraints: Anytime Classification Using Probabilistic Estimators},
Author = {Y. Yang and G.I. Webb and K. Korb and K-M. Ting},
Journal = {Machine Learning},
Year = {2007},
Number = {1},
Pages = {35-53},
Volume = {69},
Abstract = {In many online applications of machine learning, the computational resources available for classification will vary from time to time. Most techniques are designed to operate within the constraints of the minimum expected resources and fail to utilize further resources when they are available. We propose a novel anytime classification algorithm, anytime averaged probabilistic estimators (AAPE), which is capable of delivering strong prediction accuracy with little CPU time and utilizing additional CPU time to increase classification accuracy. The idea is to run an ordered sequence of very efficient Bayesian probabilistic estimators (single improvement steps) until classification time runs out. Theoretical studies and empirical validations reveal that by properly identifying, ordering, invoking and ensembling single improvement steps, AAPE is able to accomplish accurate classification whenever it is interrupted. It is also able to output class probability estimates beyond simple 0/1-loss classifications, as well as adeptly handle incremental learning.},
Address = {Netherlands},
Audit-trail = {DOI 10.1007/s10994-007-5020-z},
Doi = {10.1007/s10994-007-5020-z},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT In many online applications of machine learning, the computational resources available for classification will vary from time to time. Most techniques are designed to operate within the constraints of the minimum expected resources and fail to utilize further resources when they are available. We propose a novel anytime classification algorithm, anytime averaged probabilistic estimators (AAPE), which is capable of delivering strong prediction accuracy with little CPU time and utilizing additional CPU time to increase classification accuracy. The idea is to run an ordered sequence of very efficient Bayesian probabilistic estimators (single improvement steps) until classification time runs out. Theoretical studies and empirical validations reveal that by properly identifying, ordering, invoking and ensembling single improvement steps, AAPE is able to accomplish accurate classification whenever it is interrupted. It is also able to output class probability estimates beyond simple 0/1-loss classifications, as well as adeptly handle incremental learning.

Zheng, F., & Webb, G. I. (2007). Finding the Right Family: Parent and Child Selection for Averaged One-Dependence Estimators. Lecture Notes in Artificial Intelligence 4710: Proceedings of the 18th European Conference on Machine Learning (ECML'07), Berlin/Heidelberg, pp. 490-501.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb07,
Title = {Finding the Right Family: Parent and Child Selection for Averaged One-Dependence Estimators},
Author = {F. Zheng and G.I. Webb},
Booktitle = {Lecture Notes in Artificial Intelligence 4710: Proceedings of the 18th European Conference on Machine Learning (ECML'07)},
Year = {2007},
Address = {Berlin/Heidelberg},
Pages = {490-501},
Publisher = {Springer-Verlag},
Abstract = {Averaged One-Dependence Estimators (AODE) classifies by uniformly aggregating all qualified one-dependence estimators (ODEs). Its capacity to significantly improve naive Bayes' accuracy without undue time complexity has attracted substantial interest. Forward Sequential Selection and Backwards Sequential Elimination are effective wrapper techniques to identify and repair harmful interdependencies which have been profitably applied to naive Bayes. However, their straightforward application to AODE has previously proved ineffective. We investigate novel variants of these strategies. Our extensive experiments show that elimination of child attributes from within the constituent ODEs results in a significant improvement in probability estimate and reductions in bias and error relative to unmodified AODE. In contrast, elimination of complete constituent ODEs and the four types of attribute addition are found to be less effective and do not demonstrate any strong advantage over AODE. These surprising results lead to effective techniques for improving AODE's prediction accuracy.},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Warsaw, Poland},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Averaged One-Dependence Estimators (AODE) classifies by uniformly aggregating all qualified one-dependence estimators (ODEs). Its capacity to significantly improve naive Bayes' accuracy without undue time complexity has attracted substantial interest. Forward Sequential Selection and Backwards Sequential Elimination are effective wrapper techniques to identify and repair harmful interdependencies which have been profitably applied to naive Bayes. However, their straightforward application to AODE has previously proved ineffective. We investigate novel variants of these strategies. Our extensive experiments show that elimination of child attributes from within the constituent ODEs results in a significant improvement in probability estimate and reductions in bias and error relative to unmodified AODE. In contrast, elimination of complete constituent ODEs and the four types of attribute addition are found to be less effective and do not demonstrate any strong advantage over AODE. These surprising results lead to effective techniques for improving AODE's prediction accuracy.
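
The ODE-specific selection strategies studied in this paper are not reproduced here; the sketch below only shows the generic backwards sequential elimination loop that such wrappers build on: repeatedly delete the single element whose removal most reduces an estimated error, stopping when no deletion helps. The toy error estimate is an assumption for illustration only.

def backwards_sequential_elimination(attributes, error_of):
    """Generic backwards sequential elimination wrapper (illustrative).

    attributes -- initial set of attribute names
    error_of   -- error_of(attribute_subset) -> estimated error (e.g. by
                  cross-validation); treated as a black box in this sketch
    """
    current = set(attributes)
    best_error = error_of(current)
    improved = True
    while improved and len(current) > 1:
        improved = False
        # Try deleting each attribute; keep the single deletion that helps most.
        candidates = [(error_of(current - {a}), a) for a in current]
        err, attr = min(candidates)
        if err < best_error:
            current.remove(attr)
            best_error = err
            improved = True
    return current, best_error

if __name__ == "__main__":
    # Toy error estimate: pretend the attribute "noise" hurts and the rest help.
    toy_error = lambda s: 0.30 - 0.02 * len(s - {"noise"}) + (0.05 if "noise" in s else 0.0)
    print(backwards_sequential_elimination({"a", "b", "c", "noise"}, toy_error))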

Webb, G. I. (2007). Finding the Real Patterns (Extended Abstract). Lecture Notes in Computer Science Vol. 4426: Advances in Knowledge Discovery and Data Mining, Proceedings of the 11th Pacific-Asia Conference, PAKDD 2007, Berlin/Heidelberg, pp. 6.
[Bibtex]

@InProceedings{Webb07a,
Title = {Finding the Real Patterns (Extended Abstract)},
Author = {G.I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 4426: Advances in Knowledge Discovery and Data Mining, Proceedings of the 11th {Pacific}-{Asia} Conference, PAKDD 2007},
Year = {2007},
Address = {Berlin/Heidelberg},
Editor = {Zhi-Hua Zhou and Hang Li and Qiang Yang},
Pages = {6},
Publisher = {Springer},
Keywords = {opus},
Location = {Nanjing, China}
}
ABSTRACT 

Faux, N. G., Huttley, G. A., Mahmood, K., Webb, G. I., Garcia de la Banda, M., & Whisstock, J. C. (2007). RCPdb: An evolutionary classification and codon usage database for repeat-containing proteins. Genome Research, 17(1), 1118-1127.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{FauxHuttleyMahmoodWebbGarciaWhisstock07,
Title = {RCPdb: An evolutionary classification and codon usage database for repeat-containing proteins},
Author = {N.G. Faux and G.A. Huttley and K. Mahmood and G.I. Webb and M. Garcia de la Banda and J.C. Whisstock},
Journal = {Genome Research},
Year = {2007},
Number = {1},
Pages = {1118-1127},
Volume = {17},
Abstract = {Over 3% of human proteins contain single amino acid repeats (repeat-containing proteins, RCPs). Many repeats (homopeptides) localize to important proteins involved in transcription, and the expansion of certain repeats, in particular poly-Q and poly-A tracts, can also lead to the development of neurological diseases. Previous studies have suggested that the homopeptide makeup is a result of the presence of G+C-rich tracts in the encoding genes and that expansion occurs via replication slippage. Here, we have performed a large-scale genomic analysis of the variation of the genes encoding RCPs in 13 species and present these data in an online database (http://repeats.med.monash.edu.au/genetic_analysis/). This resource allows rapid comparison and analysis of RCPs, homopeptides, and their underlying genetic tracts across the eukaryotic species considered. We report three major findings. First, there is a bias for a small subset of codons being reiterated within homopeptides, and there is no G+C or A+T bias relative to the organism’s transcriptome. Second, single base pair transversions from the homocodon are unusually common and may represent a mechanism of reducing the rate of homopeptide mutations. Third, homopeptides that are conserved across different species lie within regions that are under stronger purifying selection in contrast to nonconserved homopeptides.},
Address = {Woodbury, New York},
Doi = {10.1101/gr.6255407},
Keywords = {Bioinformatics},
Publisher = {Cold Spring Harbor Laboratory Press, ISSN 1088-9051/07},
Related = {computational-biology}
}
ABSTRACT Over 3% of human proteins contain single amino acid repeats (repeat-containing proteins, RCPs). Many repeats (homopeptides) localize to important proteins involved in transcription, and the expansion of certain repeats, in particular poly-Q and poly-A tracts, can also lead to the development of neurological diseases. Previous studies have suggested that the homopeptide makeup is a result of the presence of G+C-rich tracts in the encoding genes and that expansion occurs via replication slippage. Here, we have performed a large-scale genomic analysis of the variation of the genes encoding RCPs in 13 species and present these data in an online database (http://repeats.med.monash.edu.au/genetic_analysis/). This resource allows rapid comparison and analysis of RCPs, homopeptides, and their underlying genetic tracts across the eukaryotic species considered. We report three major findings. First, there is a bias for a small subset of codons being reiterated within homopeptides, and there is no G+C or A+T bias relative to the organism’s transcriptome. Second, single base pair transversions from the homocodon are unusually common and may represent a mechanism of reducing the rate of homopeptide mutations. Third, homopeptides that are conserved across different species lie within regions that are under stronger purifying selection in contrast to nonconserved homopeptides.

Webb, G. I. (2007). Discovering Significant Patterns. Machine Learning, 68(1), 1-33.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb07,
Title = {Discovering Significant Patterns},
Author = {G.I. Webb},
Journal = {Machine Learning},
Year = {2007},
Number = {1},
Pages = {1-33},
Volume = {68},
Abstract = {Exploratory pattern discovery techniques, such as association rule discovery, explore large search spaces of potential patterns to find those that satisfy some user-specified constraints. Due to the large number of patterns considered, they suffer from an extreme risk of type-1 error, that is, of finding patterns that appear due to chance alone to satisfy the constraints on the sample data. This paper proposes techniques to overcome this problem by applying well-established statistical practices. These allow the user to enforce a strict upper limit on the risk of experimentwise error. Empirical studies demonstrate that standard exploratory pattern discovery techniques can discover numerous spurious patterns when applied to random data and when applied to real-world data result in large numbers of patterns that are rejected when subjected to statistical evaluation on holdout data. They also reveal that modification of the pattern discovery process to anticipate subsequent statistical evaluation can increase the number of patterns that are accepted by statistical evaluation on holdout data.},
Address = {Netherlands},
Audit-trail = {subject to revisions},
Doi = {10.1007/s10994-007-5006-x},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Publisher = {Springer},
Related = {statistically-sound-association-discovery}
}
ABSTRACT Exploratory pattern discovery techniques, such as association rule discovery, explore large search spaces of potential patterns to find those that satisfy some user-specified constraints. Due to the large number of patterns considered, they suffer from an extreme risk of type-1 error, that is, of finding patterns that appear due to chance alone to satisfy the constraints on the sample data. This paper proposes techniques to overcome this problem by applying well-established statistical practices. These allow the user to enforce a strict upper limit on the risk of experimentwise error. Empirical studies demonstrate that standard exploratory pattern discovery techniques can discover numerous spurious patterns when applied to random data and when applied to real-world data result in large numbers of patterns that are rejected when subjected to statistical evaluation on holdout data. They also reveal that modification of the pattern discovery process to anticipate subsequent statistical evaluation can increase the number of patterns that are accepted by statistical evaluation on holdout data.
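
The holdout evaluation described in the abstract can be sketched as follows: patterns found on the exploratory data are each tested on holdout data, with a Bonferroni correction for the number of patterns tested so that the experimentwise risk stays below alpha. The one-sided Fisher exact test and the helper names are illustrative choices, not the paper's prescribed procedure.

from scipy.stats import fisher_exact

def holdout_evaluation(patterns, holdout_table, alpha=0.05):
    """Statistically evaluate discovered patterns on holdout data (sketch).

    patterns      -- identifiers of the patterns found on the exploratory data
    holdout_table -- holdout_table(pattern) -> 2x2 contingency table
                     [[a, b], [c, d]] for that pattern on the holdout data
    alpha         -- familywise significance level
    """
    critical = alpha / len(patterns)   # Bonferroni correction over discoveries
    accepted = []
    for p in patterns:
        _, p_value = fisher_exact(holdout_table(p), alternative="greater")
        if p_value <= critical:
            accepted.append(p)
    return accepted

if __name__ == "__main__":
    tables = {"rule_1": [[40, 10], [20, 30]], "rule_2": [[26, 24], [25, 25]]}
    print(holdout_evaluation(list(tables), lambda p: tables[p]))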

Yang, Y., & Webb, G. I. (2006). Discretization for Data Mining. In J. Wang (Ed.), The Encyclopedia of Data Warehousing and Mining (pp. 392-396). Hershey, PA: Idea Group Inc.
[DOI] [Bibtex]

@InCollection{YangWebb05,
Title = {Discretization for Data Mining},
Author = {Y. Yang and G. I. Webb},
Booktitle = {The Encyclopedia of Data Warehousing and Mining},
Publisher = {Idea Group Inc.},
Year = {2006},
Address = {Hershey, PA},
Editor = {John Wang },
Pages = {392-396},
Audit-trail = {August 04 Copyright signed. Ying handling submission. PDF not posted},
Doi = {10.4018/978-1-59140-557-3.ch075}
}
ABSTRACT 

Webb, G. I. (2006). Anytime Learning and Classification for Online Applications. Advances in Intelligent IT: Proceedings of the Fourth International Conference on Active Media Technology (AMT'06). [Extended Abstract], Amsterdam, pp. 7-12.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb06b,
Title = {Anytime Learning and Classification for Online Applications},
Author = {G.I. Webb},
Booktitle = {Advances in Intelligent IT: Proceedings of the Fourth International Conference on Active Media Technology (AMT'06). [Extended Abstract]},
Year = {2006},
Address = {Amsterdam},
Editor = {Y. Li and M. Looi and N. Zhong},
Pages = {7-12},
Publisher = {IOS Press},
Abstract = {Many online applications of machine learning require fast classification and hence utilise efficient classifiers such as naive Bayes. However, outside periods of peak computation load, additional computational resources will often be available. Anytime classification can use whatever computational resources may be available at classification time to improve the accuracy of the classifications made.},
Audit-trail = {ISSN 0922-6389},
Location = {Brisbane, Australia}
}
ABSTRACT Many online applications of machine learning require fast classification and hence utilise efficient classifiers such as naive Bayes. However, outside periods of peak computation load, additional computational resources will often be available. Anytime classification can use whatever computational resources may be available at classification time to improve the accuracy of the classifications made.

Yang, Q., & Webb, G. I. (Ed). (2006). Lecture Notes in Artificial Intelligence 4099: Proceedings of the 9th Pacific Rim International Conference on Artificial Intelligence (PRICAI 2006). Berlin: Springer.
[Bibtex]

@Proceedings{YangWebb06,
Title = {Lecture Notes in Artificial Intelligence 4099: Proceedings of the 9th {Pacific} Rim International Conference on Artificial Intelligence (PRICAI 2006)},
Year = {2006},
Address = {Berlin},
Editor = {Q. Yang and G. I. Webb},
Publisher = {Springer},
Series = {Lecture Notes in Artificial Intelligence},
Location = {Guilin, China}
}
ABSTRACT 

Huang, S., & Webb, G. I. (2006). Efficiently Identifying Exploratory Rules' Significance. LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications', Berlin/Heidelberg, pp. 64-77.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{HuangWebb05b,
Title = {Efficiently Identifying Exploratory Rules' Significance},
Author = {S. Huang and G.I. Webb},
Booktitle = {LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications'},
Year = {2006},
Address = {Berlin/Heidelberg},
Note = {An earlier version of this paper was published in S.J. Simoff and G.J. Williams (Eds.), Proceedings of the Third Australasian Data Mining Conference (AusDM04) Cairns, Australia. Sydney: University of Technology, pages 169-182.},
Pages = {64-77},
Publisher = {Springer},
Abstract = {How to efficiently discard potentially uninteresting rules in exploratory rule discovery is one of the important research foci in data mining. Many researchers have presented algorithms to automatically remove potentially uninteresting rules utilizing background knowledge and user-specified constraints. Identifying the significance of exploratory rules using a significance test is desirable for removing rules that may appear interesting by chance, hence providing the users with a more compact set of resulting rules. However, applying statistical tests to identify significant rules requires considerable computation and data access in order to obtain the necessary statistics. The situation gets worse as the size of the database increases. In this paper, we propose two approaches for improving the efficiency of significant exploratory rule discovery. We also evaluate the experimental effect in impact rule discovery which is suitable for discovering exploratory rules in very large, dense databases.},
Doi = {10.1007/11677437_6},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Related = {statistically-sound-association-discovery}
}
ABSTRACT How to efficiently discard potentially uninteresting rules in exploratory rule discovery is one of the important research foci in data mining. Many researchers have presented algorithms to automatically remove potentially uninteresting rules utilizing background knowledge and user-specified constraints. Identifying the significance of exploratory rules using a significance test is desirable for removing rules that may appear interesting by chance, hence providing the users with a more compact set of resulting rules. However, applying statistical tests to identify significant rules requires considerable computation and data access in order to obtain the necessary statistics. The situation gets worse as the size of the database increases. In this paper, we propose two approaches for improving the efficiency of significant exploratory rule discovery. We also evaluate the experimental effect in impact rule discovery which is suitable for discovering exploratory rules in very large, dense databases.

Webb, G. I., & Brain, D. (2006). Generality is Predictive of Prediction Accuracy. LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications', Berlin/Heidelberg, pp. 1-13.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbBrain05,
Title = {Generality is Predictive of Prediction Accuracy},
Author = {G.I. Webb and D. Brain},
Booktitle = {LNAI State-of-the-Art Survey series, 'Data Mining: Theory, Methodology, Techniques, and Applications'},
Year = {2006},
Address = {Berlin/Heidelberg},
Note = {An earlier version of this paper was published in the Proceedings of PKAW 2002, pp 117-130},
Pages = {1-13},
Publisher = {Springer},
Abstract = {During knowledge acquisition it frequently occurs that multiple alternative potential rules all appear equally credible. This paper addresses the dearth of formal analysis about how to select between such alternatives. It presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. We argue that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. We also argue that in comparison to the more general rule, the accuracy of the more specific rule on unseen cases will tend to be closer to the accuracy obtained on training data. Experimental evidence is provided in support of these hypotheses. These hypotheses can be useful for selecting between rules in order to achieve specific knowledge acquisition objectives.},
Keywords = {Generality},
Related = {generality-is-predictive-of-prediction-accuracy}
}
ABSTRACT During knowledge acquisition it frequently occurs that multiple alternative potential rules all appear equally credible. This paper addresses the dearth of formal analysis about how to select between such alternatives. It presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. We argue that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. We also argue that in comparison to the more general rule, the accuracy of the more specific rule on unseen cases will tend to be closer to the accuracy obtained on training data. Experimental evidence is provided in support of these hypotheses. These hypotheses can be useful for selecting between rules in order to achieve specific knowledge acquisition objectives.

Webb, G. I. (2006). Discovering Significant Rules. Proceedings of the Twelfth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2006), New York, pp. 434-443.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb06a,
Title = {Discovering Significant Rules},
Author = {G.I. Webb},
Booktitle = {Proceedings of the Twelfth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2006)},
Year = {2006},
Address = {New York},
Editor = {L. Ungar and M. Craven and D. Gunopulos and T. Eliassi-Rad},
Pages = {434-443},
Publisher = {The Association for Computing Machinery},
Abstract = {In many applications, association rules will only be interesting if they represent non-trivial correlations between all constituent items. Numerous techniques have been developed that seek to avoid false discoveries. However, while all provide useful solutions to aspects of this problem, none provides a generic solution that is both flexible enough to accommodate varying definitions of true and false discoveries and powerful enough to provide strict control over the risk of false discoveries. This paper presents generic techniques that allow definitions of true and false discoveries to be specified in terms of arbitrary statistical hypothesis tests and which provide strict control over the experimentwise risk of false discoveries.},
Keywords = {OPUS and Association Rule Discovery and statistically sound discovery},
Location = {Philadelphia, PA},
Related = {statistically-sound-association-discovery},
Url = {http://dl.acm.org/authorize?N00546}
}
ABSTRACT In many applications, association rules will only be interesting if they represent non-trivial correlations between all constituent items. Numerous techniques have been developed that seek to avoid false discoveries. However, while all provide useful solutions to aspects of this problem, none provides a generic solution that is both flexible enough to accommodate varying definitions of true and false discoveries and powerful enough to provide strict control over the risk of false discoveries. This paper presents generic techniques that allow definitions of true and false discoveries to be specified in terms of arbitrary statistical hypothesis tests and which provide strict control over the experimentwise risk of false discoveries.

Lu, J., Yang, Y., & Webb, G. I. (2006). Incremental Discretization for Naive-Bayes Classifier. Lecture Notes in Computer Science 4093: Proceedings of the Second International Conference on Advanced Data Mining and Applications (ADMA 2006), Berlin, pp. 223-238.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{LuYangWebb06,
Title = {Incremental Discretization for Naive-Bayes Classifier},
Author = {J. Lu and Y. Yang and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 4093: Proceedings of the Second International Conference on Advanced Data Mining and Applications (ADMA 2006)},
Year = {2006},
Address = {Berlin},
Editor = {Xue Li and Osmar R. Zaiane and Zhanhuai Li },
Pages = {223-238},
Publisher = {Springer},
Abstract = {Naive-Bayes classifiers (NB) support incremental learning. However, the lack of effective incremental discretization methods has been hindering NB's incremental learning in face of quantitative data. This problem is further compounded by the fact that quantitative data are everywhere, from temperature readings to share prices. In this paper, we present a novel incremental discretization method for NB, incremental flexible frequency discretization (IFFD). IFFD discretizes values of a quantitative attribute into a sequence of intervals of flexible sizes. It allows online insertion and splitting operation on intervals. Theoretical analysis and experimental test are conducted to compare IFFD with alternative methods. Empirical evidence suggests that IFFD is efficient and effective. NB coupled with IFFD achieves a rapport between high learning efficiency and high classification accuracy in the context of incremental learning.},
Keywords = {Conditional Probability Estimation and Discretization for Naive Bayes and Incremental Learning and Stream Mining},
Location = {Xi'an, China},
Related = {discretization-for-naive-bayes}
}
ABSTRACT Naive-Bayes classifiers (NB) support incremental learning. However, the lack of effective incremental discretization methods has been hindering NB's incremental learning in face of quantitative data. This problem is further compounded by the fact that quantitative data are everywhere, from temperature readings to share prices. In this paper, we present a novel incremental discretization method for NB, incremental flexible frequency discretization (IFFD). IFFD discretizes values of a quantitative attribute into a sequence of intervals of flexible sizes. It allows online insertion and splitting operation on intervals. Theoretical analysis and experimental test are conducted to compare IFFD with alternative methods. Empirical evidence suggests that IFFD is efficient and effective. NB coupled with IFFD achieves a rapport between high learning efficiency and high classification accuracy in the context of incremental learning.
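
A heavily simplified sketch of the flexible-frequency idea: each interval stores the values it currently holds, a new value is routed to its interval, and an interval that grows beyond a maximum size is split between two distinct values near its median. The size threshold and the median split are assumptions for illustration; IFFD's actual insertion and splitting rules are more refined.

import bisect

class IncrementalFlexibleDiscretizer:
    """Simplified sketch of flexible-frequency incremental discretization."""

    def __init__(self, max_interval_size=30):
        self.max_size = max_interval_size
        self.cuts = []            # sorted upper bounds of the intervals
        self.members = [[]]       # values currently assigned to each interval

    def insert(self, value):
        idx = bisect.bisect_left(self.cuts, value)
        bisect.insort(self.members[idx], value)
        if len(self.members[idx]) > self.max_size:
            self._split(idx)
        return idx                # index of the interval the value fell into

    def _split(self, idx):
        vals = self.members[idx]
        mid = len(vals) // 2
        if vals[mid - 1] == vals[-1]:
            return                # sketch simplification: give up if no cut near the median
        while vals[mid] == vals[mid - 1]:
            mid += 1              # move the cut to a boundary between distinct values
        self.cuts.insert(idx, vals[mid - 1])
        self.members[idx:idx + 1] = [vals[:mid], vals[mid:]]

if __name__ == "__main__":
    import random
    random.seed(0)
    disc = IncrementalFlexibleDiscretizer(max_interval_size=20)
    for _ in range(200):
        disc.insert(random.gauss(0.0, 1.0))
    print("cut points:", [round(c, 2) for c in disc.cuts])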

Butler, S., & Webb, G. I. (2006). Mining Group Differences. In J. Wang (Ed.), The Encyclopedia of Data Warehousing and Mining (pp. 795-799). Hershey, PA: Idea Group Inc.
[DOI] [Bibtex]

@InCollection{ButlerWebb05,
Title = {Mining Group Differences},
Author = {S. Butler and G. I. Webb},
Booktitle = {The Encyclopedia of Data Warehousing and Mining},
Publisher = {Idea Group Inc.},
Year = {2006},
Address = {Hershey, PA},
Editor = {John Wang },
Pages = {795-799},
Audit-trail = {August 04 Copyright signed. Shane handling submission. PDF not posted},
Doi = {10.4018/978-1-60566-010-3.ch199},
Keywords = {Association Rule Discovery}
}
ABSTRACT 

Yang, Y., Webb, G. I., Cerquides, J., Korb, K., Boughton, J., & Ting, K-M. (2006). To Select or To Weigh: A Comparative Study of Model Selection and Model Weighing for SPODE Ensembles. Lecture Notes in Computer Science 4212: Proceedings of the 17th European Conference on Machine Learning (ECML'06), Berlin/Heidelberg, pp. 533-544.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebbCerquideKorbBoughtonTing06,
Title = {To Select or To Weigh: A Comparative Study of Model Selection and Model Weighing for SPODE Ensembles},
Author = {Y. Yang and G.I. Webb and J. Cerquides and K. Korb and J. Boughton and K-M. Ting},
Booktitle = {Lecture Notes in Computer Science 4212: Proceedings of the 17th European Conference on Machine Learning (ECML'06)},
Year = {2006},
Address = {Berlin/Heidelberg},
Editor = {J. Fürnkranz and T. Scheffer and M. Spiliopoulou},
Pages = {533-544},
Publisher = {Springer-Verlag},
Abstract = {An ensemble of Super-Parent-One-Dependence Estimators (SPODEs) offers a powerful yet simple alternative to naive Bayes classifiers, achieving significantly higher classification accuracy at a moderate cost in classification efficiency. Currently there exist two families of methodologies that ensemble candidate SPODEs for classification. One is to select only helpful SPODEs and uniformly average their probability estimates, a type of model selection. Another is to assign a weight to each SPODE and linearly combine their probability estimates, a methodology named model weighing. This paper presents a theoretical and empirical study comparing model selection and model weighing for ensembling SPODEs. The focus is on maximizing the ensemble's classification accuracy while minimizing its computational time. A number of representative selection and weighing schemes are studied, providing comprehensive research on this topic and identifying effective schemes that provide alternative trade-offs between speed and expected error.},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Berlin, Germany},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT An ensemble of Super-Parent-One-Dependence Estimators (SPODEs) offers a powerful yet simple alternative to naive Bayes classifiers, achieving significantly higher classification accuracy at a moderate cost in classification efficiency. Currently there exist two families of methodologies that ensemble candidate SPODEs for classification. One is to select only helpful SPODEs and uniformly average their probability estimates, a type of model selection. Another is to assign a weight to each SPODE and linearly combine their probability estimates, a methodology named model weighing. This paper presents a theoretical and empirical study comparing model selection and model weighing for ensembling SPODEs. The focus is on maximizing the ensemble's classification accuracy while minimizing its computational time. A number of representative selection and weighing schemes are studied, providing comprehensive research on this topic and identifying effective schemes that provide alternative trade-offs between speed and expected error.

Zheng, F., & Webb, G. I. (2006). Efficient Lazy Elimination for Averaged One-Dependence Estimators. ACM International Conference Proceeding Series, Vol. 148: The Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06), New York, NY, pp. 1113-1120.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb06,
Title = {Efficient Lazy Elimination for Averaged One-Dependence Estimators},
Author = {F. Zheng and G.I. Webb},
Booktitle = {ACM International Conference Proceeding Series, Vol. 148: The Proceedings of the Twenty-third International Conference on Machine Learning (ICML'06)},
Year = {2006},
Address = {New York, NY},
Editor = { W. Cohen and A. Moore},
Pages = {1113-1120},
Publisher = {ACM Press},
Abstract = {Semi-naive Bayesian classifiers seek to retain the numerous strengths of naive Bayes while reducing error by weakening the attribute independence assumption. Backwards Sequential Elimination (BSE) is a wrapper technique for attribute elimination that has proved effective at this task. We explore a new efficient technique, Lazy Elimination (LE), which eliminates highly related attribute-values at classification time without the computational overheads inherent in wrapper techniques. We analyze the effect of LE and BSE on Averaged One-Dependence Estimators (AODE), a state-of-the-art semi-naive Bayesian algorithm. Our extensive experiments show that LE significantly reduces bias and error without undue additional computation, while BSE significantly reduces bias but not error, with high training time complexity. In the context of AODE, LE has a significant advantage over BSE in both computational efficiency and error.},
Audit-trail = {ISBN:1-59593-383-2, DOI http://doi.acm.org/10.1145/1143844.1143984},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Pittsburgh, Pennsylvania},
Related = {learning-complex-conditional-probabilities-from-data},
Url = {http://dl.acm.org/authorize?N00547}
}
ABSTRACT Semi-naive Bayesian classifiers seek to retain the numerous strengths of naive Bayes while reducing error by weakening the attribute independence assumption. Backwards Sequential Elimination (BSE) is a wrapper technique for attribute elimination that has proved effective at this task. We explore a new efficient technique, Lazy Elimination (LE), which eliminates highly related attribute-values at classification time without the computational overheads inherent in wrapper techniques. We analyze the effect of LE and BSE on Averaged One-Dependence Estimators (AODE), a state-of-the-art semi-naive Bayesian algorithm. Our extensive experiments show that LE significantly reduces bias and error without undue additional computation, while BSE significantly reduces bias but not error, with high training time complexity. In the context of AODE, LE has a significant advantage over BSE in both computational efficiency and error.
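
The core of lazy elimination can be sketched directly from the abstract: at classification time, an attribute value is dropped when another value of the same instance implies it (its conditional probability given that other value is one in the training data), so that only the more specific value contributes to the probability estimates. The count structures and the threshold parameter below are illustrative assumptions, not the paper's implementation.

def lazy_eliminate(instance_values, pair_counts, value_counts, threshold=1.0):
    """Classification-time elimination of subsumed attribute values (sketch).

    instance_values -- attribute values of the instance to classify
    pair_counts     -- pair_counts[frozenset((u, v))]: training examples containing both
    value_counts    -- value_counts[v]: training examples containing v
    threshold       -- a value u is dropped when P(u | v) >= threshold for some
                       retained value v; 1.0 removes only perfectly implied values
    """
    keep = list(instance_values)
    i = 0
    while i < len(keep):
        u = keep[i]
        implied = any(
            v != u
            and value_counts.get(v, 0) > 0
            and pair_counts.get(frozenset((u, v)), 0) / value_counts[v] >= threshold
            for v in keep
        )
        if implied:
            del keep[i]           # drop the generalisation, keep the specific value
        else:
            i += 1
    return keep

if __name__ == "__main__":
    # "pregnant=yes" implies "gender=female", so the latter is eliminated.
    values = ["gender=female", "pregnant=yes", "age=30s"]
    value_counts = {"gender=female": 500, "pregnant=yes": 60, "age=30s": 300}
    pair_counts = {frozenset(("gender=female", "pregnant=yes")): 60,
                   frozenset(("gender=female", "age=30s")): 150,
                   frozenset(("pregnant=yes", "age=30s")): 20}
    print(lazy_eliminate(values, pair_counts, value_counts))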

Webb, G. I. (2005). K-Optimal Pattern Discovery: An Efficient and Effective Approach to Exploratory Data Mining. Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005) [Extended Abstract], Berlin/Heidelberg, pp. 1-2.
[PDF] [Bibtex]

@InProceedings{Webb05a,
Title = {K-Optimal Pattern Discovery: An Efficient and Effective Approach to Exploratory Data Mining},
Author = {G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005)[Extended Abstract]},
Year = {2005},
Address = {Berlin/Heidelberg},
Editor = {S. Zhang and R. Jarvis},
Pages = {1-2},
Publisher = {Springer},
Audit-trail = {http://dx.doi.org/10.1007/11589990_1},
Keywords = {Association Rule Discovery},
Location = {Sydney, Australia}
}
ABSTRACT 

Zheng, F., & Webb, G. I. (2005). A Comparative Study of Semi-naive Bayes Methods in Classification Learning. Proceedings of the Fourth Australasian Data Mining Conference (AusDM05), Sydney, pp. 141-156.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb05,
Title = {A Comparative Study of Semi-naive Bayes Methods in Classification Learning},
Author = {F. Zheng and G.I. Webb},
Booktitle = {Proceedings of the Fourth Australasian Data Mining Conference (AusDM05)},
Year = {2005},
Address = {Sydney},
Editor = {S.J. Simoff and G.J. Williams and J. Galloway and I. Kolyshkina },
Pages = {141-156},
Publisher = {University of Technology},
Abstract = {Numerous techniques have sought to improve the accuracy of Naive Bayes (NB) by alleviating the attribute interdependence problem. This paper summarizes these semi-naive Bayesian methods into two groups: those that apply conventional NB with a new attribute set, and those that alter NB by allowing inter-dependencies between attributes. We review eight typical semi-naive Bayesian learning algorithms and perform error analysis using the bias-variance decomposition on thirty-six natural domains from the UCI Machine Learning Repository. In analysing the results of these experiments we provide general recommendations for selection between methods.},
Keywords = {AODE and Conditional Probability Estimation},
Location = {Sydney, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Numerous techniques have sought to improve the accuracy of Naive Bayes (NB) by alleviating the attribute interdependence problem. This paper summarizes these semi-naive Bayesian methods into two groups: those that apply conventional NB with a new attribute set, and those that alter NB by allowing inter-dependencies between attributes. We review eight typical semi-naive Bayesian learning algorithms and perform error analysis using the bias-variance decomposition on thirty-six natural domains from the UCI Machine Learning Repository. In analysing the results of these experiments we provide general recommendations for selection between methods.

Huang, S., & Webb, G. I. (2005). Discarding Insignificant Rules During Impact Rule Discovery in Large, Dense Databases. Proceedings of the Fifth SIAM International Conference on Data Mining (SDM'05) [short paper], Philadelphia, PA, pp. 541-545.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{HuangWebb05,
Title = {Discarding Insignificant Rules During Impact Rule Discovery in Large, Dense Databases},
Author = {S. Huang and G.I. Webb},
Booktitle = {Proceedings of the Fifth {SIAM} International Conference on Data Mining ({SDM}'05) [short paper]},
Year = {2005},
Address = {Philadelphia, PA},
Editor = {H. Kargupta and C. Kamath and J. Srivastava and A. Goodman},
Pages = {541-545},
Publisher = {Society for Industrial and Applied Mathematics},
Abstract = {Considerable progress has been made on how to reduce the number of spurious exploratory rules with quantitative attributes. However, little has been done for rules with undiscretized quantitative attributes. It is argued that propositional rules cannot effectively describe the interactions between quantitative and qualitative attributes. Aumann and Lindell proposed quantitative association rules to provide a better description of such relationships, together with a rule pruning technique. Since their technique is based on the frequent itemset framework, it is not suitable for rule discovery in large, dense databases. In this paper, an efficient technique for automatically discarding insignificant rules during rule discovery is proposed, based on the OPUS search algorithm. Experiments demonstrate that the algorithm we propose can efficiently remove potentially uninteresting rules even in very large, dense databases.},
Audit-trail = {Shiying travelling to present paper. Requested permission to post pdf 10/2},
Keywords = {Impact Rules and OPUS},
Location = {Newport Beach, CA},
Related = {impact-rules}
}
ABSTRACT Considerable progress has been made on how to reduce the number of spurious exploratory rules with quantitative attributes. However, little has been done for rules with undiscretized quantitative attributes. It is argued that propositional rules cannot effectively describe the interactions between quantitative and qualitative attributes. Aumann and Lindell proposed quantitative association rules to provide a better description of such relationships, together with a rule pruning technique. Since their technique is based on the frequent itemset framework, it is not suitable for rule discovery in large, dense databases. In this paper, an efficient technique for automatically discarding insignificant rules during rule discovery is proposed, based on the OPUS search algorithm. Experiments demonstrate that the algorithm we propose can efficiently remove potentially uninteresting rules even in very large, dense databases.

Yang, Y., Webb, G. I., & Wu, X. (2005). Discretization Methods. In O. Maimon & L. Rokach (Eds.), The Data Mining and Knowledge Discovery Handbook (pp. 113-130). Berlin: Springer.
[DOI] [Bibtex]

@InCollection{YangWebbWu05,
Title = {Discretization Methods},
Author = {Y. Yang and G. I. Webb and X. Wu},
Booktitle = {The Data Mining and Knowledge Discovery Handbook},
Publisher = {Springer},
Year = {2005},
Address = {Berlin},
Editor = {O. Maimon and L. Rokach },
Pages = {113-130},
Doi = {10.1007/978-0-387-09823-4_6}
}
ABSTRACT 

Webb, G. I., Boughton, J., & Wang, Z. (2005). Not So Naive Bayes: Aggregating One-Dependence Estimators. Machine Learning, 58(1), 5-24.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbBoughtonWang05,
Title = {Not So Naive {Bayes}: Aggregating One-Dependence Estimators},
Author = {G. I. Webb and J. Boughton and Z. Wang},
Journal = {Machine Learning},
Year = {2005},
Number = {1},
Pages = {5-24},
Volume = {58},
Abstract = {Of numerous proposals to improve the accuracy of naive Bayes by weakening its attribute independence assumption, both LBR and TAN have demonstrated remarkable error performance. However, both techniques obtain this outcome at a considerable computational cost. We present a new approach to weakening the attribute independence assumption by averaging all of a constrained class of classifiers. In extensive experiments this technique delivers comparable prediction accuracy to LBR and TAN with substantially improved computational efficiency.},
Address = {Netherlands},
Audit-trail = {3/5/04 Pre-print posted},
Doi = {10.1007/s10994-005-4258-6},
Keywords = {Conditional Probability Estimation and AODE},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Of numerous proposals to improve the accuracy of naive Bayes by weakening its attribute independence assumption, both LBR and TAN have demonstrated remarkable error performance. However, both techniques obtain this outcome at a considerable computational cost. We present a new approach to weakening the attribute independence assumption by averaging all of a constrained class of classifiers. In extensive experiments this technique delivers comparable prediction accuracy to LBR and TAN with substantially improved computational efficiency.
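
A minimal sketch of the averaging scheme this abstract describes: every attribute value that occurs often enough acts as a super-parent, each such parent contributes the one-dependence estimate P(y, x_i) * prod_{j != i} P(x_j | y, x_i), and the estimates are averaged. The Laplace smoothing and the frequency threshold below are simple illustrative choices rather than the published settings.

from collections import Counter

class AODE:
    """Minimal sketch of Averaged One-Dependence Estimators (AODE)."""

    def __init__(self, min_parent_count=1):
        self.min_parent_count = min_parent_count

    def fit(self, X, y):
        self.n = len(X)
        self.classes = sorted(set(y))
        self.values = [set() for _ in range(len(X[0]))]
        self.cls_attr = Counter()   # (c, i, v)       -> count
        self.cls_pair = Counter()   # (c, i, v, j, w) -> count
        for xs, c in zip(X, y):
            for i, v in enumerate(xs):
                self.values[i].add(v)
                self.cls_attr[(c, i, v)] += 1
                for j, w in enumerate(xs):
                    if j != i:
                        self.cls_pair[(c, i, v, j, w)] += 1
        return self

    def _joint(self, c, xs):
        total, parents = 0.0, 0
        for i, v in enumerate(xs):
            parent_count = sum(self.cls_attr[(k, i, v)] for k in self.classes)
            if parent_count < self.min_parent_count:
                continue            # too rare to serve as a super-parent
            parents += 1
            # P(c, x_i), Laplace-smoothed
            p = (self.cls_attr[(c, i, v)] + 1.0) / (
                self.n + len(self.classes) * len(self.values[i]))
            for j, w in enumerate(xs):
                if j != i:
                    # P(x_j | c, x_i), Laplace-smoothed
                    p *= (self.cls_pair[(c, i, v, j, w)] + 1.0) / (
                        self.cls_attr[(c, i, v)] + len(self.values[j]))
            total += p
        return total / parents if parents else 1.0 / len(self.classes)

    def predict(self, xs):
        return max(self.classes, key=lambda c: self._joint(c, xs))

if __name__ == "__main__":
    X = [("sunny", "hot"), ("sunny", "mild"), ("rain", "mild"), ("rain", "cool")]
    y = ["no", "no", "yes", "yes"]
    print(AODE().fit(X, y).predict(("rain", "mild")))   # -> "yes"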

Siu, K. K. W., Butler, S. M., Beveridge, T., Gillam, J. E., Hall, C. J., Kaye, A. H., Lewis, R. A., Mannan, K., McLoughlin, G., Pearson, S., Round, A. R., Schultke, E., Webb, G. I., & Wilkinson, S. J. (2005). Identifying markers of pathology in SAXS data of malignant tissues of the brain. Nuclear Instruments and Methods in Physics Research A, 548, 140-146.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{SiuEtAl05,
Title = {Identifying markers of pathology in SAXS data of malignant tissues of the brain},
Author = {K.K.W. Siu and S.M. Butler and T. Beveridge and J.E. Gillam and C.J. Hall and A.H. Kaye and R.A. Lewis and K. Mannan and G. McLoughlin and S. Pearson and A.R. Round and E. Schultke and G.I. Webb and S.J. Wilkinson},
Journal = {Nuclear Instruments and Methods in Physics Research A},
Year = {2005},
Pages = {140-146},
Volume = {548},
Abstract = {Conventional neuropathological analysis for brain malignancies is heavily reliant on the observation of morphological abnormalities, observed in thin, stained sections of tissue. Small Angle X-ray Scattering (SAXS) data provide an alternative means of distinguishing pathology by examining the ultra-structural (nanometer length scales) characteristics of tissue. To evaluate the diagnostic potential of SAXS for brain tumors, data was collected from normal, malignant and benign tissues of the human brain at station 2.1 of the Daresbury Laboratory Synchrotron Radiation Source and subjected to data mining and multivariate statistical analysis. The results suggest SAXS data may be an effective classifier of malignancy.},
Doi = {10.1016/j.nima.2005.03.081},
Keywords = {Bioinformatics},
Publisher = {Elsevier},
Related = {computational-biology}
}
ABSTRACT Conventional neuropathological analysis for brain malignancies is heavily reliant on the observation of morphological abnormalities, observed in thin, stained sections of tissue. Small Angle X-ray Scattering (SAXS) data provide an alternative means of distinguishing pathology by examining the ultra-structural (nanometer length scales) characteristics of tissue. To evaluate the diagnostic potential of SAXS for brain tumors, data was collected from normal, malignant and benign tissues of the human brain at station 2.1 of the Daresbury Laboratory Synchrotron Radiation Source and subjected to data mining and multivariate statistical analysis. The results suggest SAXS data may be an effective classifier of malignancy.

Huang, S., & Webb, G. I. (2005). Pruning Derivative Partial Rules During Impact Rule Discovery. Lecture Notes in Computer Science Vol. 3518: Proceedings of the 9th Pacific-Asia Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2005), Berlin/Heidelberg, pp. 71-80.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{HuangWebb05a,
Title = {Pruning Derivative Partial Rules During Impact Rule Discovery},
Author = {S. Huang and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 3518: Proceedings of the 9th {Pacific}-{Asia} Conference on Advances in Knowledge Discovery and Data Mining (PAKDD 2005)},
Year = {2005},
Address = {Berlin/Heidelberg},
Editor = {T.B. Ho and D. Cheung and H. Liu },
Pages = {71-80},
Publisher = {Springer},
Abstract = {Because exploratory rule discovery works with data that is only a sample of the phenomena to be investigated, some resulting rules may appear interesting only by chance. Techniques are developed for automatically discarding statistically insignificant exploratory rules that cannot survive a hypothesis with regard to its ancestors. We call such insignificant rules derivative extended rules. In this paper, we argue that there is another type of derivative exploratory rules, which is derivative with regard to their children. We also argue that considerable amount of such derivative partial rules can not be successfully removed using existing rule pruning techniques. We propose a new technique to address this problem. Experiments are done in impact rule discovery to evaluate the effect of this derivative partial rule filter. Results show that the inherent problem of too many resulting rules in exploratory rule discovery is alleviated.},
Keywords = {Impact Rules},
Location = {Hanoi, Vietnam},
Related = {impact-rules}
}
ABSTRACT Because exploratory rule discovery works with data that is only a sample of the phenomena to be investigated, some resulting rules may appear interesting only by chance. Techniques are developed for automatically discarding statistically insignificant exploratory rules that cannot survive a hypothesis with regard to its ancestors. We call such insignificant rules derivative extended rules. In this paper, we argue that there is another type of derivative exploratory rules, which is derivative with regard to their children. We also argue that considerable amount of such derivative partial rules can not be successfully removed using existing rule pruning techniques. We propose a new technique to address this problem. Experiments are done in impact rule discovery to evaluate the effect of this derivative partial rule filter. Results show that the inherent problem of too many resulting rules in exploratory rule discovery is alleviated.

Webb, G. I., & Ting, K. M. (2005). On the Application of ROC Analysis to Predict Classification Performance Under Varying Class Distributions. Machine Learning, 58(1), 25-32.
[PDF] [Bibtex] [Abstract]

@Article{WebbTing05,
Title = {On the Application of ROC Analysis to Predict Classification Performance Under Varying Class Distributions},
Author = {G. I. Webb and K.M. Ting},
Journal = {Machine Learning},
Year = {2005},
Number = {1},
Pages = {25-32},
Volume = {58},
Abstract = {We counsel caution in the application of ROC analysis for prediction of classifier accuracy under varying class distributions. The heart of our contention is that in real-world applications variations of class distribution are likely to result from forces that affect the distribution of the attribute-values, rather than forces that directly affect the class distribution. In statistical terms, it is usually the class, rather than the attributes, that is the dependent variable. If the class distribution alters as an indirect consequence of changes in the distribution of the attribute values, rather than vice versa, performance estimates derived through ROC analysis may be grossly inaccurate.},
Address = {Netherlands},
Audit-trail = {22/4 Preprint pdf posted},
Publisher = {Springer}
}
ABSTRACT We counsel caution in the application of ROC analysis for prediction of classifier accuracy under varying class distributions. The heart of our contention is that in real-world applications variations of class distribution are likely to result from forces that affect the distribution of the attribute-values, rather than forces that directly affect the class distribution. In statistical terms, it is usually the class, rather than the attributes, that is the dependent variable. If the class distribution alters as an indirect consequence of changes in the distribution of the attribute values, rather than vice versa, performance estimates derived through ROC analysis may be grossly inaccurate.
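
To make the concern concrete: ROC-based prediction of accuracy under a new class prior applies acc = prior * TPR + (1 - prior) * (1 - FPR), which is only valid if TPR and FPR are themselves unchanged by whatever caused the prior to shift. The sketch below (illustrative numbers only, not from the paper) simply evaluates that formula; the paper's warning is that the invariance assumption behind it frequently fails when the attribute-value distribution is what has changed.

```python
# Illustrative only: the standard ROC-based prediction of accuracy under a new class
# prior. The formula assumes TPR and FPR stay fixed when the class distribution changes;
# the paper's caution is that this invariance often fails when the shift is driven by
# changes in the attribute-value distribution.

def predicted_accuracy(tpr: float, fpr: float, positive_prior: float) -> float:
    """Accuracy predicted from an ROC operating point and an assumed class prior."""
    return positive_prior * tpr + (1.0 - positive_prior) * (1.0 - fpr)

if __name__ == "__main__":
    tpr, fpr = 0.80, 0.10                # hypothetical operating point
    for prior in (0.5, 0.2, 0.05):       # hypothetical class distributions
        print(f"prior={prior:.2f}  predicted accuracy={predicted_accuracy(tpr, fpr, prior):.3f}")
```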

Yang, Y., Korb, K., Ting, K-M., & Webb, G. I. (2005). Ensemble Selection for SuperParent-One-Dependence Estimators. Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005), Berlin/Heidelberg, pp. 102-111.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangKorbTingWebb05,
Title = {Ensemble Selection for SuperParent-One-Dependence Estimators},
Author = {Y. Yang and K. Korb and K-M. Ting and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 3809: Advances in Artificial Intelligence, Proceedings of the 18th Australian Joint Conference on Artificial Intelligence (AI 2005)},
Year = {2005},
Address = {Berlin/Heidelberg},
Editor = {S. Zhang and R. Jarvis},
Pages = {102-111},
Publisher = {Springer},
Abstract = {SuperParent-One-Dependence Estimators (SPODEs) loosen Naive-Bayes' attribute independence assumption by allowing each attribute to depend on a common single attribute (superparent) in addition to the class. An ensemble of SPODEs is able to achieve high classification accuracy with modest computational cost. This paper investigates how to select SPODEs for ensembling. Various popular model selection strategies are presented. Their learning efficacy and efficiency are theoretically analyzed and empirically verified. Accordingly, guidelines are investigated for choosing between selection criteria in differing contexts.},
Audit-trail = {http://dx.doi.org/10.1007/11589990_13},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Sydney, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT SuperParent-One-Dependence Estimators (SPODEs) loosen Naive-Bayes' attribute independence assumption by allowing each attribute to depend on a common single attribute (superparent) in addition to the class. An ensemble of SPODEs is able to achieve high classification accuracy with modest computational cost. This paper investigates how to select SPODEs for ensembling. Various popular model selection strategies are presented. Their learning efficacy and efficiency are theoretically analyzed and empirically verified. Accordingly, guidelines are investigated for choosing between selection criteria in differing contexts.
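
For readers unfamiliar with SPODEs, the sketch below is a minimal, illustrative implementation (not the authors' code) of a uniform SPODE ensemble over discrete data: each SPODE with superparent sp models the joint P(y, x) as P(y, x_sp) times the product over the remaining attributes of P(x_i | y, x_sp), and the ensemble averages these joint estimates. The paper's actual subject, criteria for selecting which SPODEs to include, is not implemented here; the Laplace smoothing constants and toy data are arbitrary assumptions.

```python
# A minimal, illustrative sketch (not the authors' code) of a uniform SPODE ensemble
# over fully discrete data. Each SPODE with superparent attribute sp models
# P(y, x) ~ P(y, x_sp) * prod_{i != sp} P(x_i | y, x_sp); the ensemble averages these
# joint estimates. Laplace smoothing constants and the toy data are arbitrary choices.
from collections import Counter

def train_spodes(X, y):
    n, d = len(X), len(X[0])
    classes = sorted(set(y))
    values = [sorted({row[i] for row in X}) for i in range(d)]
    joint = Counter()     # counts of (sp, class, x_sp)
    cond = Counter()      # counts of (sp, i, class, x_sp, x_i)
    for row, c in zip(X, y):
        for sp in range(d):
            joint[(sp, c, row[sp])] += 1
            for i in range(d):
                if i != sp:
                    cond[(sp, i, c, row[sp], row[i])] += 1
    return n, d, classes, values, joint, cond

def predict(model, x):
    n, d, classes, values, joint, cond = model
    scores = {}
    for c in classes:
        total = 0.0
        for sp in range(d):
            # Smoothed estimate of P(y = c, x_sp), then the one-dependence factors.
            p = (joint[(sp, c, x[sp])] + 1.0) / (n + len(classes) * len(values[sp]))
            for i in range(d):
                if i != sp:
                    p *= (cond[(sp, i, c, x[sp], x[i])] + 1.0) / (joint[(sp, c, x[sp])] + len(values[i]))
            total += p
        scores[c] = total / d              # uniform average over all superparents
    return max(scores, key=scores.get)

if __name__ == "__main__":
    X = [(0, 1, 0), (1, 1, 0), (0, 0, 1), (1, 0, 1)]
    y = ["a", "a", "b", "b"]
    print(predict(train_spodes(X, y), (0, 1, 0)))
```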

Webb, G. I., & Zhang, S. (2005). K-Optimal Rule Discovery. Data Mining and Knowledge Discovery, 10(1), 39-79.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbZhang05,
Title = {K-Optimal Rule Discovery},
Author = {G. I. Webb and S. Zhang},
Journal = {Data Mining and Knowledge Discovery},
Year = {2005},
Number = {1},
Pages = {39-79},
Volume = {10},
Abstract = {K-most-interesting rule discovery finds the k rules that optimize a user-specified measure of interestingness with respect to a set of sample data and user-specified constraints. This approach avoids many limitations of the frequent itemset approach of association rule discovery. This paper presents a scalable algorithm applicable to a wide range of k-most-interesting rule discovery tasks and demonstrates its efficiency.},
Address = {Netherlands},
Doi = {10.1007/s10618-005-0255-4},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Publisher = {Springer},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT K-most-interesting rule discovery finds the k rules that optimize a user-specified measure of interestingness with respect to a set of sample data and user-specified constraints. This approach avoids many limitations of the frequent itemset approach of association rule discovery. This paper presents a scalable algorithm applicable to a wide range of k-most-interesting rule discovery tasks and demonstrates its efficiency.
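
The task itself is easy to state in code. The brute-force sketch below is illustrative only (the paper presents a scalable OPUS-based algorithm, not this enumeration): it finds the k rules with single-item consequents that maximise one possible interestingness measure, leverage, over a toy transaction set.

```python
# A brute-force, illustrative sketch of the k-most-interesting rule discovery task (the
# paper presents a scalable OPUS-based algorithm, not this enumeration): enumerate small
# antecedents and single-item consequents and keep the k rules that maximise one possible
# interestingness measure, leverage. Transactions and parameters are toy assumptions.
import heapq
from itertools import combinations

def leverage(transactions, antecedent, consequent):
    n = len(transactions)
    a = sum(antecedent <= t for t in transactions)
    c = sum(consequent in t for t in transactions)
    both = sum(antecedent <= t and consequent in t for t in transactions)
    return both / n - (a / n) * (c / n)

def top_k_rules(transactions, k=3, max_antecedent=2):
    items = sorted({i for t in transactions for i in t})
    best = []                              # min-heap of (score, antecedent, consequent)
    for size in range(1, max_antecedent + 1):
        for ant in combinations(items, size):
            for cons in items:
                if cons in ant:
                    continue
                rule = (leverage(transactions, frozenset(ant), cons), ant, cons)
                if len(best) < k:
                    heapq.heappush(best, rule)
                elif rule[0] > best[0][0]:
                    heapq.heapreplace(best, rule)
    return sorted(best, key=lambda r: r[0], reverse=True)

if __name__ == "__main__":
    data = [frozenset(t) for t in ({"milk", "bread"}, {"milk", "bread", "butter"},
                                   {"bread", "butter"}, {"milk"}, {"bread"})]
    for score, ant, cons in top_k_rules(data):
        print(f"{set(ant)} -> {cons}  leverage={score:.3f}")
```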

Webb, G. I., & Conilione, P. (2004). Estimating bias and variance from data. Unpublished manuscript.
[PDF] [Bibtex] [Abstract]

@Unpublished{WebbConilione04,
Title = {Estimating bias and variance from data},
Author = {Webb, Geoffrey I and Conilione, Paul},
Note = {Unpublished manuscript},
Year = {2004},
Abstract = {The bias-variance decomposition of error provides useful insights into the error performance of a classifier as it is applied to different types of learning task. Most notably, it has been used to explain the extraordinary effectiveness of ensemble learning techniques. It is important that the research community have effective tools for assessing such explanations. To this end, techniques have been developed for estimating bias and variance from data. The most widely deployed of these uses repeated sub-sampling with a holdout set. We argue, with empirical support, that this approach has serious limitations. First, it provides very little flexibility in the types of distributions of training sets that may be studied. It requires that the training sets be relatively small and that the degree of variation between training sets be very circumscribed. Second, the approach leads to bias and variance estimates that have high statistical variance and hence low reliability. We develop an alternative method that is based on cross-validation. We show that this method allows far greater flexibility in the types of distribution that are examined and that the estimates derived are much more stable. Finally, we show that changing the distributions of training sets from which bias and variance estimates are drawn can alter substantially the bias and variance estimates that are derived.},
Keywords = {Learning from large datasets and Bias-Variance}
}
ABSTRACT The bias-variance decomposition of error provides useful insights into the error performance of a classifier as it is applied to different types of learning task. Most notably, it has been used to explain the extraordinary effectiveness of ensemble learning techniques. It is important that the research community have effective tools for assessing such explanations. To this end, techniques have been developed for estimating bias and variance from data. The most widely deployed of these uses repeated sub-sampling with a holdout set. We argue, with empirical support, that this approach has serious limitations. First, it provides very little flexibility in the types of distributions of training sets that may be studied. It requires that the training sets be relatively small and that the degree of variation between training sets be very circumscribed. Second, the approach leads to bias and variance estimates that have high statistical variance and hence low reliability. We develop an alternative method that is based on cross-validation. We show that this method allows far greater flexibility in the types of distribution that are examined and that the estimates derived are much more stable. Finally, we show that changing the distributions of training sets from which bias and variance estimates are drawn can alter substantially the bias and variance estimates that are derived.
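
A minimal sketch of the cross-validation-based idea follows, under assumptions of my own: a toy 1-nearest-neighbour learner and one common 0/1-loss convention in which the "main" prediction for an instance is the mode of its predictions across runs, bias is the error of that main prediction, and variance is the average disagreement with it. These are not necessarily the manuscript's exact definitions or procedure.

```python
# A minimal sketch of estimating bias and variance from data via repeated
# cross-validation. The learner (1-nearest-neighbour), the 0/1-loss convention (the
# "main" prediction is the mode across runs, bias is its error, variance is the average
# disagreement with it), and the toy data are assumptions for illustration only.
import random
from collections import Counter, defaultdict

def one_nn(train, query):
    """Return the label of the nearest training example (squared Euclidean distance)."""
    return min(train, key=lambda ex: sum((a - b) ** 2 for a, b in zip(ex[0], query)))[1]

def bias_variance_cv(data, repetitions=10, folds=3, seed=0):
    rng = random.Random(seed)
    predictions = defaultdict(list)      # instance index -> labels predicted for it
    for _ in range(repetitions):
        order = list(range(len(data)))
        rng.shuffle(order)
        for f in range(folds):
            test = order[f::folds]
            test_set = set(test)
            train = [data[i] for i in order if i not in test_set]
            for i in test:
                predictions[i].append(one_nn(train, data[i][0]))
    bias = variance = 0.0
    for i, (_, label) in enumerate(data):
        main = Counter(predictions[i]).most_common(1)[0][0]
        bias += main != label
        variance += sum(p != main for p in predictions[i]) / len(predictions[i])
    return bias / len(data), variance / len(data)

if __name__ == "__main__":
    toy = [((0.0, 0.0), "a"), ((0.1, 0.2), "a"), ((0.5, 0.4), "a"),
           ((1.0, 1.0), "b"), ((0.9, 1.1), "b"), ((0.6, 0.7), "b")]
    b, v = bias_variance_cv(toy)
    print(f"estimated bias={b:.3f}  variance={v:.3f}")
```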

Thiruvady, D. R., & Webb, G. I. (2004). Mining Negative Rules using GRD. Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 04) [Short Paper], Berlin/Heidelberg, pp. 161-165.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ThiruvadyWebb04,
Title = {Mining Negative Rules using GRD},
Author = {D. R. Thiruvady and G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD 04) [Short Paper]},
Year = {2004},
Address = {Berlin/Heidelberg},
Editor = {H. Dai and R. Srikant and C. Zhang },
Pages = {161-165},
Publisher = {Springer},
Abstract = {GRD is an algorithm for k-most interesting rule discovery. In contrast to association rule discovery, GRD does not require the use of a minimum support constraint. Rather, the user must specify a measure of interestingness and the number of rules sought (k). This paper reports efficient techniques to extend GRD to support mining of negative rules. We demonstrate that the new approach provides tractable discovery of both negative and positive rules.},
Audit-trail = {PDF posted 23/8},
Keywords = {association rule discovery and opus},
Location = {Sydney, Australia},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT GRD is an algorithm for k-most interesting rule discovery. In contrast to association rule discovery, GRD does not require the use of a minimum support constraint. Rather, the user must specify a measure of interestingness and the number of rules sought (k). This paper reports efficient techniques to extend GRD to support mining of negative rules. We demonstrate that the new approach provides tractable discovery of both negative and positive rules.

Newlands, D. A., & Webb, G. I. (2004). Convex Hulls as an Hypothesis Language Bias. Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV), Southampton, UK, pp. 285-294.
[PDF] [Bibtex] [Abstract]

@InProceedings{NewlandsWebb04,
Title = {Convex Hulls as an Hypothesis Language Bias},
Author = {D.A. Newlands and G.I. Webb},
Booktitle = {Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV)},
Year = {2004},
Address = {Southampton, UK},
Editor = {N.F.F.E. Ebecken and C.A. Brebbia and A. Zanasi},
Pages = {285-294},
Publisher = {WIT Press},
Abstract = {Classification learning is dominated by systems which induce large numbers of small axis-orthogonal decision surfaces which biases such systems towards particular hypothesis types. However, there is reason to believe that many domains have underlying concepts which do not involve axis orthogonal surfaces. Further, the multiplicity of small decision regions mitigates against any holistic appreciation of the theories produced by these systems, notwithstanding the fact that many of the small regions are individually comprehensible. We propose the use of less strongly biased hypothesis languages which might be expected to model concepts using a number of structures close to the number of actual structures in the domain. An instantiation of such a language, a convex hull based classifier, CH1, has been implemented to investigate modeling concepts as a small number of large geometric structures in n-dimensional space. A comparison of the number of regions induced is made against other well-known systems on a representative selection of largely or wholly continuous valued machine learning tasks. The convex hull system is shown to produce a number of induced regions about an order of magnitude less than well-known systems and very close to the number of actual concepts. This representation, as convex hulls, allows the possibility of extraction of higher level mathematical descriptions of the induced concepts, using the techniques of computational geometry.},
Audit-trail = {Poor quality pdf posted has no ref},
Location = {Rio de Janeiro, Brazil}
}
ABSTRACT Classification learning is dominated by systems which induce large numbers of small axis-orthogonal decision surfaces which biases such systems towards particular hypothesis types. However, there is reason to believe that many domains have underlying concepts which do not involve axis orthogonal surfaces. Further, the multiplicity of small decision regions mitigates against any holistic appreciation of the theories produced by these systems, notwithstanding the fact that many of the small regions are individually comprehensible. We propose the use of less strongly biased hypothesis languages which might be expected to model concepts using a number of structures close to the number of actual structures in the domain. An instantiation of such a language, a convex hull based classifier, CH1, has been implemented to investigate modeling concepts as a small number of large geometric structures in n-dimensional space. A comparison of the number of regions induced is made against other well-known systems on a representative selection of largely or wholly continuous valued machine learning tasks. The convex hull system is shown to produce a number of induced regions about an order of magnitude less than well-known systems and very close to the number of actual concepts. This representation, as convex hulls, allows the possibility of extraction of higher level mathematical descriptions of the induced concepts, using the techniques of computational geometry.

Wang, Z., Webb, G. I., & Zheng, F. (2004). Selective Augmented Bayesian Network Classifiers Based on Rough Set Theory. Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD 04), Berlin/Heidelberg, pp. 319-328.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebbZheng04,
Title = {Selective Augmented Bayesian Network Classifiers Based on Rough Set Theory},
Author = {Z. Wang and G.I. Webb and F. Zheng},
Booktitle = {Lecture Notes in Computer Science Vol. 3056: Proceedings of the Eighth {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD 04)},
Year = {2004},
Address = {Berlin/Heidelberg},
Editor = {H. Dai and R. Srikant and C. Zhang},
Pages = {319-328},
Publisher = {Springer},
Abstract = {The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. TAN, Tree-Augmented Naive Bayes, is a state-of-the-art extension of naive Bayes that can express limited forms of inter-dependence among attributes. Rough set theory provides tools for expressing inexact or partial dependencies within a dataset. In this paper, we present a variant of TAN and compare their tree classifier structures, which can be thought of as a selective restricted trees Bayesian classifier. It delivers lower error than both pre-existing state-of-the-art TAN-based classifiers, with substantially less computation than is required by the SuperParent approach.},
Audit-trail = {PDF posted 23/8},
Keywords = {Conditional Probability Estimation and AODE and Learning from large datasets},
Location = {Sydney, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. TAN, Tree-Augmented Naive Bayes, is a state-of-the-art extension of naive Bayes that can express limited forms of inter-dependence among attributes. Rough set theory provides tools for expressing inexact or partial dependencies within a dataset. In this paper, we present a variant of TAN and compare their tree classifier structures, which can be thought of as a selective restricted trees Bayesian classifier. It delivers lower error than both pre-existing state-of-the-art TAN-based classifiers, with substantially less computation than is required by the SuperParent approach.

Webb, G. I., & Yu, X. (Ed). (2004). Lecture Notes in Computer Science 3339: Proceedings of the 17th Australian Joint Conference on Artificial Intelligence (AI 2004). Berlin: Springer.
[Bibtex]

@Proceedings{WebbYu04,
Title = {Lecture Notes in Computer Science 3339: Proceedings of the 17th Australian Joint Conference on Artificial Intelligence (AI 2004)},
Year = {2004},
Address = {Berlin},
Editor = {G. I. Webb and X. Yu},
Publisher = {Springer},
Series = {Lecture Notes in Computer Science},
Location = {Cairns, Australia}
}
ABSTRACT 

Webb, G. I., & Zheng, Z. (2004). Multistrategy Ensemble Learning: Reducing Error by Combining Ensemble Learning Techniques. IEEE Transactions on Knowledge and Data Engineering, 16(8), 980-991.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbZheng04,
Title = {Multistrategy Ensemble Learning: Reducing Error by Combining Ensemble Learning Techniques},
Author = {G.I. Webb and Z. Zheng},
Journal = {{IEEE} Transactions on Knowledge and Data Engineering},
Year = {2004},
Number = {8},
Pages = {980-991},
Volume = {16},
Abstract = {Ensemble learning strategies, especially Boosting and Bagging decision trees, have demonstrated impressive capacities to improve the prediction accuracy of base learning algorithms. Further gains have been demonstrated by strategies that combine simple ensemble formation approaches. In this paper, we investigate the hypothesis that the improvement in accuracy of multi-strategy approaches to ensemble learning is due to an increase in the diversity of ensemble members that are formed. In addition, guided by this hypothesis, we develop three new multi-strategy ensemble-learning techniques. Experimental results in a wide variety of natural domains suggest that these multi-strategy ensemble-learning techniques are, on average, more accurate than their component ensemble learning techniques.},
Address = {Los Alamitos, CA},
Audit-trail = {Due for publication approx July 2004. {IEEE} copyright signed. 28/10/03 No paper posted - link to TKDE site given},
Keywords = {MultiBoosting and Boosting},
Publisher = {{IEEE} Computer Society},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT Ensemble learning strategies, especially Boosting and Bagging decision trees, have demonstrated impressive capacities to improve the prediction accuracy of base learning algorithms. Further gains have been demonstrated by strategies that combine simple ensemble formation approaches. In this paper, we investigate the hypothesis that the improvement in accuracy of multi-strategy approaches to ensemble learning is due to an increase in the diversity of ensemble members that are formed. In addition, guided by this hypothesis, we develop three new multi-strategy ensemble-learning techniques. Experimental results in a wide variety of natural domains suggest that these multi-strategy ensemble-learning techniques are, on average, more accurate than their component ensemble learning techniques.

Newlands, D. A., & Webb, G. I. (2004). Alternative Strategies for Decision List Construction. Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV), Southampton, UK, pp. 265-273.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{NewlandsWebb04a,
Title = {Alternative Strategies for Decision List Construction},
Author = {D.A. Newlands and G.I. Webb},
Booktitle = {Proceedings of the Fourth International Conference on Data Mining (DATA MINING IV)},
Year = {2004},
Address = {Southampton, UK},
Editor = {N.F.F.E. Ebecken and C.A. Brebbia and A. Zanasi},
Pages = {265-273},
Publisher = {WIT Press},
Abstract = {This work surveys well-known approaches to building decision lists. Some novel variations to strategies based on default rules for the most common class and insertion of new rules before the default rule are presented. These are expected to offer speed up in the construction of the decision list as well as compression of the length of the list. These strategies and a testing regime have been implemented and some empirical studies done to compare the strategies. Experimental results are presented and interpreted. We show that all strategies deliver decision lists of comparable accuracy. However, two techniques are shown to deliver this accuracy with lists composed of significantly fewer rules than alternative strategies. Of these, one also demonstrates significant computational advantages. The prepending strategy is also demonstrated to produce decision lists which are as much as an order of magnitude shorter than those produced by CN2.},
Audit-trail = {Paper posted on web 9/8/04},
Keywords = {Prepend},
Location = {Rio de Janeiro, Brazil},
Related = {prepending}
}
ABSTRACT This work surveys well-known approaches to building decision lists. Some novel variations to strategies based on default rules for the most common class and insertion of new rules before the default rule are presented. These are expected to offer speed up in the construction of the decision list as well as compression of the length of the list. These strategies and a testing regime have been implemented and some empirical studies done to compare the strategies. Experimental results are presented and interpreted. We show that all strategies deliver decision lists of comparable accuracy. However, two techniques are shown to deliver this accuracy with lists composed of significantly fewer rules than alternative strategies. Of these, one also demonstrates significant computational advantages. The prepending strategy is also demonstrated to produce decision lists which are as much as an order of magnitude shorter than those produced by CN2.

Yang, Y., & Webb, G. I. (2003). On Why Discretization Works for Naive-Bayes Classifiers. Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 440-452.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebb03c,
Title = {On Why Discretization Works for Naive-Bayes Classifiers},
Author = {Y. Yang and G. I. Webb},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03)},
Year = {2003},
Address = {Berlin/Heidelberg},
Editor = {T.D. Gedeon and L.C.C. Fung },
Pages = {440-452},
Publisher = {Springer},
Abstract = {We investigate why discretization is effective in naive-Bayes learning. We prove a theorem that identifies particular conditions under which discretization will result in naive Bayes classifiers delivering the same probability estimates as would be obtained if the correct probability density functions were employed. We discuss the factors that might affect naive-Bayes classification error under discretization. We suggest that the use of different discretization techniques can affect the classification bias and variance of the generated classifiers, an effect named discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error},
Keywords = {Discretization for Naive Bayes},
Location = {Perth, Australia},
Related = {discretization-for-naive-bayes}
}
ABSTRACT We investigate why discretization is effective in naive-Bayes learning. We prove a theorem that identifies particular conditions under which discretization will result in naive Bayes classifiers delivering the same probability estimates as would be obtained if the correct probability density functions were employed. We discuss the factors that might affect naive-Bayes classification error under discretization. We suggest that the use of different discretization techniques can affect the classification bias and variance of the generated classifiers, an effect named discretization bias and variance. We argue that by properly managing discretization bias and variance, we can effectively reduce naive-Bayes classification error

Yang, Y., & Webb, G. I. (2003). Weighted Proportional k-Interval Discretization for Naive-Bayes Classifiers. Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD'03), Berlin/Heidelberg, pp. 501-512.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebb03,
Title = {Weighted Proportional k-Interval Discretization for Naive-Bayes Classifiers},
Author = {Y. Yang and G.I. Webb},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD'03)},
Year = {2003},
Address = {Berlin/Heidelberg},
Editor = {K-Y. Whang and J. Jeon and K. Shim and J. Srivastava },
Pages = {501-512},
Publisher = {Springer-Verlag},
Abstract = {The use of different discretization techniques can be expected to affect the bias and variance of a learning algorithm. We call such an effect discretization bias and variance. Proportional k-interval discretization (PKID) tunes discretization bias and variance by adjusting discretized interval size and number proportional to the number of training instances. Theoretical analysis suggests that this is desirable for naive-Bayes classifiers. However PKID has sub-optimal performance when learning from small training data. We argue that this is because PKID equally weighs bias reduction and variance reduction. But for small data, variance reduction can contribute more to lower learning error and thus should be given greater weight than bias reduction. Accordingly we propose weighted proportional k-interval discretization (WPKID), which establishes a more suitable bias and variance trade-off for small data while allowing additional training data to be used to reduce both bias and variance. Our experiments demonstrate that for naive-Bayes classifiers, WPKID improves upon PKID for smaller datasets with significant frequency; and WPKID delivers lower classification error significantly more often than not in comparison to the other three leading alternative discretization techniques studied.},
Audit-trail = {Waiting on copy of copyright form from Ying},
Keywords = {Discretization for Naive Bayes},
Location = {Seoul, Korea},
Related = {discretization-for-naive-bayes}
}
ABSTRACT The use of different discretization techniques can be expected to affect the bias and variance of a learning algorithm. We call such an effect discretization bias and variance. Proportional k-interval discretization (PKID) tunes discretization bias and variance by adjusting discretized interval size and number proportional to the number of training instances. Theoretical analysis suggests that this is desirable for naive-Bayes classifiers. However PKID has sub-optimal performance when learning from small training data. We argue that this is because PKID equally weighs bias reduction and variance reduction. But for small data, variance reduction can contribute more to lower learning error and thus should be given greater weight than bias reduction. Accordingly we propose weighted proportional k-interval discretization (WPKID), which establishes a more suitable bias and variance trade-off for small data while allowing additional training data to be used to reduce both bias and variance. Our experiments demonstrate that for naive-Bayes classifiers, WPKID improves upon PKID for smaller datasets with significant frequency; and WPKID delivers lower classification error significantly more often than not in comparison to the other three leading alternative discretization techniques studied.
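
As a rough sketch of the proportional idea (an illustration, not the paper's exact specification): PKID ties both the number of equal-frequency intervals and the frequency per interval to the amount of training data, here both taken as approximately the square root of N, so that additional data reduces both discretization bias and variance. WPKID's small-data refinement, which weights variance reduction more heavily, is not shown.

```python
# An illustrative sketch of the proportional idea behind PKID (not the paper's exact
# specification): both the number of equal-frequency intervals and the frequency per
# interval grow with the number of training values, here both taken as roughly sqrt(N).
# WPKID's small-data refinement (weighting variance reduction more heavily) is not shown.
import math, random

def pkid_cut_points(values):
    xs = sorted(values)
    n = len(xs)
    t = max(1, int(math.sqrt(n)))        # number of intervals ~ sqrt(N)
    size = max(1, n // t)                # values per interval ~ sqrt(N)
    return [xs[i * size] for i in range(1, t)]

def interval_of(value, cuts):
    """Index of the equal-frequency interval the value falls into."""
    return sum(value >= c for c in cuts)

if __name__ == "__main__":
    random.seed(1)
    sample = [random.gauss(0.0, 1.0) for _ in range(100)]
    cuts = pkid_cut_points(sample)
    print(f"{len(cuts) + 1} intervals from {len(sample)} values")
    print("interval of 0.3:", interval_of(0.3, cuts))
```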

Webb, G. I. (2003). Association Rules. In Ye, N. (Ed.), The Handbook of Data Mining (Chapter 2, pp. 25-39). Lawrence Erlbaum Associates.
[Bibtex]

@InCollection{Webb03,
Title = {Association Rules},
Author = {G. I. Webb},
Booktitle = {The Handbook of Data Mining, Chapter 2},
Publisher = {Lawrence Erlbaum Associates},
Year = {2003},
Editor = {Dr. Nong Ye},
Pages = {25 - 39 },
Audit-trail = {*},
Keywords = {Association Rule Discovery}
}
ABSTRACT 

Zhang, C., Zhang, S., & Webb, G. I. (2003). Identifying Approximate Item-Sets Of Interest In Large Databases. Applied Intelligence, 18, 91-104.
[URL] [Bibtex] [Abstract]

@Article{ZhangZhangWebb03,
Title = {Identifying Approximate Item-Sets Of Interest In Large Databases},
Author = {C. Zhang and S. Zhang and G. I. Webb},
Journal = {Applied Intelligence},
Year = {2003},
Pages = {91-104},
Volume = {18},
Abstract = {This paper presents a method for discovering approximate frequent itemsets of interest in large scale databases. This method uses the central limit theorem to increase efficiency, enabling us to reduce the sample size by about half compared to previous approximations. Further efficiency is gained by pruning from the search space uninteresting frequent itemsets. In addition to improving efficiency, this measure also reduces the number of itemsets that the user need consider. The model and algorithm have been implemented and evaluated using both synthetic and real-world databases. Our experimental results demonstrate the efficiency of the approach},
Address = {Netherlands},
Audit-trail = {Link to paper via Kluwer site. No PDF posted},
Keywords = {Association Rule Discovery},
Publisher = {Springer},
Url = {http://link.springer.com/content/pdf/10.1023%2FA%3A1020995206763.pdf}
}
ABSTRACT This paper presents a method for discovering approximate frequent itemsets of interest in large scale databases. This method uses the central limit theorem to increase efficiency, enabling us to reduce the sample size by about half compared to previous approximations. Further efficiency is gained by pruning from the search space uninteresting frequent itemsets. In addition to improving efficiency, this measure also reduces the number of itemsets that the user need consider. The model and algorithm have been implemented and evaluated using both synthetic and real-world databases. Our experimental results demonstrate the efficiency of the approach

Webb, G. I., Butler, S., & Newlands, D. (2003). On Detecting Differences Between Groups. Proceedings of The Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2003), New York, pp. 256-265.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbButlerNewlands03,
Title = {On Detecting Differences Between Groups},
Author = {G. I. Webb and S. Butler and D. Newlands},
Booktitle = {Proceedings of The Ninth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2003)},
Year = {2003},
Address = {New York},
Editor = {P. Domingos and C. Faloutsos and T. Senator and H. Kargupta and L. Getoor},
Pages = {256-265},
Publisher = {The Association for Computing Machinery},
Abstract = {Understanding the differences between contrasting groups is a fundamental task in data analysis. This realization has led to the development of a new special purpose data mining technique, {\em contrast-set mining}. We undertook a study with a retail collaborator to compare contrast-set mining with existing rule-discovery techniques. To our surprise we observed that straightforward application of an existing commercial rule-discovery system, Magnum Opus, could successfully perform the contrast-set-mining task. This led to the realization that contrast-set mining is a special case of the more general rule-discovery task. We present the results of our study together with a proof of this conclusion},
Audit-trail = {PDF with ACM copyright posted in accordance with conditions of copyright},
Keywords = {OPUS and Association Rule Discovery},
Location = {Washington, DC},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT Understanding the differences between contrasting groups is a fundamental task in data analysis. This realization has led to the development of a new special purpose data mining technique, contrast-set mining. We undertook a study with a retail collaborator to compare contrast-set mining with existing rule-discovery techniques. To our surprise we observed that straightforward application of an existing commercial rule-discovery system, Magnum Opus, could successfully perform the contrast-set-mining task. This led to the realization that contrast-set mining is a special case of the more general rule-discovery task. We present the results of our study together with a proof of this conclusion.
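
The observation that contrast-set mining reduces to rule discovery with group membership as the consequent can be illustrated with a small sketch (mine, not the commercial system used in the study): candidate conditions are ranked by the difference in their support between the two groups, which is what a rule-discovery run targeting the group attribute effectively surfaces. The data and reporting threshold below are hypothetical.

```python
# An illustrative sketch (not the commercial system used in the study) of casting
# contrast-set mining as rule discovery with group membership as the consequent:
# candidate conditions are ranked by the difference in their support between two groups.
# The data and the reporting threshold are hypothetical.
from itertools import combinations

def contrasts(group_a, group_b, max_size=2, min_diff=0.3):
    items = sorted({i for t in group_a + group_b for i in t})
    found = []
    for size in range(1, max_size + 1):
        for cand in combinations(items, size):
            c = frozenset(cand)
            support_a = sum(c <= t for t in group_a) / len(group_a)
            support_b = sum(c <= t for t in group_b) / len(group_b)
            if abs(support_a - support_b) >= min_diff:
                found.append((abs(support_a - support_b), cand, support_a, support_b))
    return sorted(found, key=lambda r: r[0], reverse=True)

if __name__ == "__main__":
    group_a = [frozenset(t) for t in ({"x", "y"}, {"x"}, {"x", "z"}, {"x", "y"})]
    group_b = [frozenset(t) for t in ({"y"}, {"z"}, {"y", "z"}, {"z"})]
    for diff, cond, sa, sb in contrasts(group_a, group_b):
        print(f"{set(cond)}: support {sa:.2f} vs {sb:.2f} (difference {diff:.2f})")
```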

Butler, S. M., Webb, G. I., & Lewis, R. A. (2003). A Case Study in Feature Invention for Breast Cancer Diagnosis Using X-Ray Scatter Images. Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 677-685.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ButlerWebbLewis03,
Title = {A Case Study in Feature Invention for Breast Cancer Diagnosis Using X-Ray Scatter Images},
Author = {S. M. Butler and G.I. Webb and R.A. Lewis},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03)},
Year = {2003},
Address = {Berlin/Heidelberg},
Editor = {T.D. Gedeon and L.C.C. Fung },
Pages = {677-685},
Publisher = {Springer},
Abstract = {X-ray mammography is the current method for screening for breast cancer, and like any technique, has its limitations. Several groups have reported differences in the X-ray scattering patterns of normal and tumour tissue from the breast. This gives rise to the hope that X-ray scatter analysis techniques may lead to a more accurate and cost effective method of diagnosing breast cancer which lends itself to automation. This is a particularly challenging exercise due to the inherent complexity of the information content in X-ray scatter patterns from complex heterogeneous tissue samples. We use a simple naive Bayes classifier, coupled with Equal Frequency Discretization (EFD), as our classification system. High-level features are extracted from the low-level pixel data. This paper reports some preliminary results in the ongoing development of this classification method that can distinguish between the diffraction patterns of normal and cancerous tissue, with particular emphasis on the invention of features for classification.},
Doi = {10.1007/978-3-540-24581-0_58},
Keywords = {Bioinformatics},
Location = {Perth, Australia},
Related = {computational-biology}
}
ABSTRACT X-ray mammography is the current method for screening for breast cancer, and like any technique, has its limitations. Several groups have reported differences in the X-ray scattering patterns of normal and tumour tissue from the breast. This gives rise to the hope that X-ray scatter analysis techniques may lead to a more accurate and cost effective method of diagnosing breast cancer which lends itself to automation. This is a particularly challenging exercise due to the inherent complexity of the information content in X-ray scatter patterns from complex heterogeneous tissue samples. We use a simple naive Bayes classifier, coupled with Equal Frequency Discretization (EFD), as our classification system. High-level features are extracted from the low-level pixel data. This paper reports some preliminary results in the ongoing development of this classification method that can distinguish between the diffraction patterns of normal and cancerous tissue, with particular emphasis on the invention of features for classification.

Rolfe, B., Hodgson, P., & Webb, G. I. (2003). Improving the Prediction of the Roll Separating Force in a Hot Steel Finishing Mill. Intelligence in a Small World - Nanomaterials for the 21st Century. Selected Papers from IPMM-2003, Boca Raton, Florida.
[PDF] [Bibtex]  → Related papers and software

@InProceedings{RolfeHodgsonWebb03,
Title = {Improving the Prediction of the Roll Separating Force in a Hot Steel Finishing Mill},
Author = {B Rolfe and P Hodgson and G. I. Webb},
Booktitle = {Intelligence in a Small World - Nanomaterials for the 21st Century. Selected Papers from IPMM-2003},
Year = {2003},
Address = {Boca Raton, Florida},
Editor = {J.A. Meech},
Publisher = {CRC-Press },
Audit-trail = {*},
Keywords = {Engineering Applications},
Location = {Sendai, Japan},
Related = {engineering-applications}
}
ABSTRACT 

Wang, Z., Webb, G. I., & Zheng, F. (2003). Adjusting Dependence Relations for Semi-Lazy TAN Classifiers. Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence (AI 03), Berlin/Heidelberg, pp. 453-465.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebbZheng03,
Title = {Adjusting Dependence Relations for Semi-Lazy TAN Classifiers},
Author = {Z. Wang and G.I. Webb and F. Zheng},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 2903: Proceedings of the 16th Australian Conference on Artificial Intelligence ({AI} 03)},
Year = {2003},
Address = {Berlin/Heidelberg},
Editor = {T.D. Gedeon and L.C.C. Fung },
Pages = {453-465},
Publisher = {Springer},
Abstract = {The naive Bayesian classifier is a simple and effective classification method, which assumes a Bayesian network in which each attribute has the class label as its only parent. But this assumption does not necessarily hold in many real world domains. Tree-Augmented Naïve Bayes (TAN) is a state-of-the-art extension of the naive Bayes, which can express partial dependence relations among attributes. In this paper, we analyze the implementations of two different TAN classifiers and their tree structures. Experiments show how different dependence relations impact on the accuracy of TAN classifiers. We present a kind of semi-lazy TAN classifier, which builds a TAN identical to the original TAN at training time, but adjusts the dependence relations for a new test instance at classification time. Our extensive experimental results show that this kind of semi-lazy classifier delivers lower error than the original TAN and is more efficient than SuperParent TAN.},
Keywords = {Conditional Probability Estimation},
Location = {Perth, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT The naive Bayesian classifier is a simple and effective classification method, which assumes a Bayesian network in which each attribute has the class label as its only parent. But this assumption does not necessarily hold in many real world domains. Tree-Augmented Naïve Bayes (TAN) is a state-of-the-art extension of the naive Bayes, which can express partial dependence relations among attributes. In this paper, we analyze the implementations of two different TAN classifiers and their tree structures. Experiments show how different dependence relations impact on the accuracy of TAN classifiers. We present a kind of semi-lazy TAN classifier, which builds a TAN identical to the original TAN at training time, but adjusts the dependence relations for a new test instance at classification time. Our extensive experimental results show that this kind of semi-lazy classifier delivers lower error than the original TAN and is more efficient than SuperParent TAN.

Shi, H., Wang, Z., Webb, G. I., & Huang, H. (2003). A New Restricted Bayesian Network Classifier. Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh Pacific-Asia Conference on Knowledge Discovery and Data Mining (PAKDD'03), Berlin/Heidelberg, pp. 265-270.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ShiWangWebbHuang03,
Title = {A New Restricted Bayesian Network Classifier},
Author = {H. Shi and Z. Wang and G.I. Webb and H. Huang},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 2637: Proceedings of the Seventh {Pacific}-{Asia} Conference on Knowledge Discovery and Data Mining (PAKDD'03)},
Year = {2003},
Address = {Berlin/Heidelberg},
Editor = {K-Y. Whang and J. Jeon and K. Shim and J. Srivastava },
Pages = {265-270},
Publisher = {Springer-Verlag},
Abstract = {On the basis of examining the existing restricted Bayesian network classifiers, a new Bayes-theorem-based and more strictly restricted Bayesian-network-based classification model, DLBAN, is proposed, which can be viewed as a double-level Bayesian network augmented naive Bayes classification. The experimental results show that the DLBAN classifier is better than the TAN classifier in most cases.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation},
Location = {Seoul, Korea},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT On the basis of examining the existing restricted Bayesian network classifiers, a new Bayes-theorem-based and more strictly restricted Bayesian-network-based classification model, DLBAN, is proposed, which can be viewed as a double-level Bayesian network augmented naive Bayes classification. The experimental results show that the DLBAN classifier is better than the TAN classifier in most cases.

Rolfe, B., Frayman, Y., Webb, G. I., & Hodgson, P. (2003). Analysis of Stamping Production Data with View Towards Quality Management. Proceedings of the 9th International Conference on Manufacturing Excellence (ICME 03).
[PDF] [Bibtex]  → Related papers and software

@InProceedings{RolfeFraymanWebbHodgson03,
Title = {Analysis of Stamping Production Data with View Towards Quality Management},
Author = {B. Rolfe and Y. Frayman and G.I. Webb and P. Hodgson},
Booktitle = {Proceedings of the 9th International Conference on Manufacturing Excellence (ICME 03)},
Year = {2003},
Keywords = {Engineering Applications},
Location = {Melbourne, Australia},
Related = {engineering-applications}
}
ABSTRACT 

Webb, G. I. (2003). Preliminary Investigations into Statistically Valid Exploratory Rule Discovery. Proceedings of the Second Australasian Data Mining Conference (AusDM03), Sydney, pp. 1-9.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb03a,
Title = {Preliminary Investigations into Statistically Valid Exploratory Rule Discovery},
Author = {G.I. Webb},
Booktitle = {Proceedings of the Second Australasian Data Mining Conference (AusDM03)},
Year = {2003},
Address = {Sydney},
Editor = {S.J. Simoff and G.J. Williams and M. Hegland},
Pages = {1-9},
Publisher = {University of Technology},
Abstract = {Exploratory rule discovery, as exemplified by association rule discovery, has proven very popular. In this paper I investigate issues surrounding the statistical validity of rules found using this approach and methods that might be employed to deliver statistically sound exploratory rule discovery.},
Audit-trail = {Submitted to AusDM03. No copyright required. Check key words},
Keywords = {Association Rule Discovery and statistically sound discovery and OPUS},
Location = {Canberra, Australia},
Related = {statistically-sound-association-discovery}
}
ABSTRACT Exploratory rule discovery, as exemplified by association rule discovery, has proven very popular. In this paper I investigate issues surrounding the statistical validity of rules found using this approach and methods that might be employed to deliver statistically sound exploratory rule discovery.
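
One concrete way to realise "statistically sound" filtering, shown here purely as an illustration of the general idea rather than the paper's procedure, is to test each candidate rule's antecedent-consequent contingency table with Fisher's exact test and correct for the number of rules examined (a Bonferroni-style adjustment). The counts in the sketch are hypothetical.

```python
# An illustration of the general idea (not the paper's exact procedure): test each
# candidate rule X -> y with Fisher's exact test on its 2x2 contingency table and apply
# a Bonferroni-style correction for the number of rules examined. Counts are hypothetical.
from scipy.stats import fisher_exact

def rule_is_significant(n_xy, n_x_noty, n_notx_y, n_notx_noty, rules_tested, alpha=0.05):
    table = [[n_xy, n_x_noty], [n_notx_y, n_notx_noty]]
    _, p = fisher_exact(table, alternative="greater")   # does X raise the rate of y?
    return p <= alpha / rules_tested, p

if __name__ == "__main__":
    # Hypothetical counts: 40 of 50 X-transactions contain y, versus 60 of 150 others.
    significant, p = rule_is_significant(40, 10, 60, 90, rules_tested=1000)
    print(f"p = {p:.2e}, significant after correction: {significant}")
```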

Webb, G. I., & Zhang, S. (2002). Removing Trivial Associations in Association Rule Discovery. Proceedings of the First International NAISO Congress on Autonomous Intelligent Systems (ICAIS 2002), Canada/The Netherlands.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbZhang02,
Title = {Removing Trivial Associations in Association Rule Discovery},
Author = {G. I. Webb and S. Zhang},
Booktitle = {Proceedings of the First International NAISO Congress on Autonomous Intelligent Systems (ICAIS 2002)},
Year = {2002},
Address = {Canada/The Netherlands},
Publisher = {NAISO Academic Press},
Abstract = {Association rule discovery has become one of the most widely applied data mining strategies. Techniques for association rule discovery have been dominated by the frequent itemset strategy as exemplified by the Apriori algorithm. One limitation of this approach is that it provides little opportunity to detect and remove association rules on the basis of relationships between rules. As a result, the association rules discovered are frequently swamped with large numbers of spurious rules that are of little interest to the user. This paper presents association rule discovery techniques that can detect and discard one form of spurious association rule: trivial associations.},
Audit-trail = {Pre-publication PDF posted},
Keywords = {OPUS and Association Rule Discovery},
Location = {Geelong, Australia},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT Association rule discovery has become one of the most widely applied data mining strategies. Techniques for association rule discovery have been dominated by the frequent itemset strategy as exemplified by the Apriori algorithm. One limitation of this approach is that it provides little opportunity to detect and remove association rules on the basis of relationships between rules. As a result, the association rules discovered are frequently swamped with large numbers of spurious rules that are of little interest to the user. This paper presents association rule discovery techniques that can detect and discard one form of spurious association rule: trivial associations.

Yang, Y., & Webb, G. I. (2002). A Comparative Study of Discretization Methods for Naive-Bayes Classifiers. Proceedings of the 2002 Pacific Rim Knowledge Acquisition Workshop (PKAW'02), Tokyo, pp. 159-173.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebb02a,
Title = {A Comparative Study of Discretization Methods for Naive-Bayes Classifiers},
Author = { Y. Yang and G. I. Webb},
Booktitle = {Proceedings of the 2002 {Pacific} Rim Knowledge Acquisition Workshop (PKAW'02)},
Year = {2002},
Address = {Tokyo},
Editor = {T. Yamaguchi and A. Hoffmann and H. Motoda and P. Compton },
Pages = {159-173},
Publisher = {Japanese Society for Artificial Intelligence},
Abstract = {Discretization is a popular approach to handling numeric attributes in machine learning. We argue that the requirements for effective discretization differ between naive-Bayes learning and many other learning algorithms. We evaluate the effectiveness with naive-Bayes classifiers of nine discretization methods, equal width discretization (EWD), equal frequency discretization (EFD), fuzzy discretization (FD), entropy minimization discretization (EMD), iterative discretization (ID), proportional k-interval discretization (PKID), lazy discretization (LD), non-disjoint discretization (NDD) and weighted proportional k-interval discretization (WPKID). It is found that in general naive-Bayes classifiers trained on data preprocessed by LD, NDD or WPKID achieve lower classification error than those trained on data preprocessed by the other discretization methods. But LD can not scale to large data. This study leads to a new discretization method, weighted non-disjoint discretization (WNDD) that combines WPKID and NDD's advantages. Our experiments show that among all the rival discretization methods, WNDD best helps naive-Bayes classifiers reduce average classification error.},
Audit-trail = {*},
Keywords = {Discretization for Naive Bayes},
Location = {Tokyo, Japan},
Related = {discretization-for-naive-bayes}
}
ABSTRACT Discretization is a popular approach to handling numeric attributes in machine learning. We argue that the requirements for effective discretization differ between naive-Bayes learning and many other learning algorithms. We evaluate the effectiveness with naive-Bayes classifiers of nine discretization methods, equal width discretization (EWD), equal frequency discretization (EFD), fuzzy discretization (FD), entropy minimization discretization (EMD), iterative discretization (ID), proportional k-interval discretization (PKID), lazy discretization (LD), non-disjoint discretization (NDD) and weighted proportional k-interval discretization (WPKID). It is found that in general naive-Bayes classifiers trained on data preprocessed by LD, NDD or WPKID achieve lower classification error than those trained on data preprocessed by the other discretization methods. But LD can not scale to large data. This study leads to a new discretization method, weighted non-disjoint discretization (WNDD) that combines WPKID and NDD's advantages. Our experiments show that among all the rival discretization methods, WNDD best helps naive-Bayes classifiers reduce average classification error.
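
For reference, the two simplest methods in the comparison are easy to state in code. The sketch below (bin counts and data are arbitrary illustrations) contrasts equal width discretization, which splits the observed range into k equal-width bins, with equal frequency discretization, which places roughly the same number of training values in each bin.

```python
# An illustrative sketch of the two simplest methods in the comparison: equal width
# discretization (EWD) splits the observed range into k equal-width bins, while equal
# frequency discretization (EFD) puts roughly the same number of training values in each
# bin. The bin count and data below are arbitrary.
def equal_width_cuts(values, k):
    lo, hi = min(values), max(values)
    width = (hi - lo) / k
    return [lo + i * width for i in range(1, k)]

def equal_frequency_cuts(values, k):
    xs = sorted(values)
    size = max(1, len(xs) // k)
    return [xs[i * size] for i in range(1, k)]

def bin_index(value, cuts):
    return sum(value >= c for c in cuts)

if __name__ == "__main__":
    data = [1, 1, 2, 2, 2, 3, 8, 9, 50, 100]
    for name, cuts in (("EWD", equal_width_cuts(data, 3)), ("EFD", equal_frequency_cuts(data, 3))):
        print(f"{name} cuts: {cuts}  bin of 9: {bin_index(9, cuts)}")
```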

Pearce, J., Webb, G. I., Shaw, R., & Garner, B. (2002). A Framework for Experimentation and Self Learning in Continuous Database Marketing. Proceedings of the IEEE International Conference on Data Mining (ICDM-2002), Los Alamitos, CA, pp. 490-497.
[PDF] [Bibtex] [Abstract]

@InProceedings{PearceWebbShawGarner02b,
Title = {A Framework for Experimentation and Self Learning in Continuous Database Marketing},
Author = {J. Pearce and G. I. Webb and R. Shaw and B. Garner},
Booktitle = {Proceedings of the {IEEE} International Conference on Data Mining (ICDM-2002)},
Year = {2002},
Address = {Los Alamitos, CA},
Pages = {490-497},
Publisher = {{IEEE} Computer Society},
Abstract = {We present a method for continuous database marketing that identifies target customers for a number of marketing offers using predictive models. The algorithm then selects the appropriate offer for the customer. Experimental design principles are encapsulated to capture more information that will be used to monitor and refine the predictive models. The updated predictive models are then used for the next round of marketing offers.},
Audit-trail = {http://csdl.computer.org/comp/proceedings/icdm/2002/1754/00/1754toc.htm},
Location = {Maebashi City, Japan}
}
ABSTRACT We present a method for continuous database marketing that identifies target customers for a number of marketing offers using predictive models. The algorithm then selects the appropriate offer for the customer. Experimental design principles are encapsulated to capture more information that will be used to monitor and refine the predictive models. The updated predictive models are then used for the next round of marketing offers.

Brain, D., & Webb, G. I. (2002). The Need for Low Bias Algorithms in Classification Learning From Large Data Sets. Lecture Notes in Computer Science 2431: Principles of Data Mining and Knowledge Discovery: Proceedings of the Sixth European Conference (PKDD 2002), Berlin/Heidelberg, pp. 62-73.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{BrainWebb02,
Title = {The Need for Low Bias Algorithms in Classification Learning From Large Data Sets},
Author = { D. Brain and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 2431: Principles of Data Mining and Knowledge Discovery: Proceedings of the Sixth European Conference (PKDD 2002)},
Year = {2002},
Address = {Berlin/Heidelberg},
Pages = {62-73},
Publisher = {Springer-Verlag},
Abstract = {This paper reviews the appropriateness for application to large data sets of standard machine learning algorithms, which were mainly developed in the context of small data sets. Sampling and parallelization have proved useful means for reducing computation time when learning from large data sets. However, such methods assume that algorithms that were designed for use with what are now considered small data sets are also fundamentally suitable for large data sets. It is plausible that optimal learning from large data sets requires a different type of algorithm to optimal learning from small data sets. This paper investigates one respect in which data set size may affect the requirements of a learning algorithm – the bias plus variance decomposition of classification error. Experiments show that learning from large data sets may be more effective when using an algorithm that places greater emphasis on bias management, rather than variance management.},
Audit-trail = {http://link.springer.de/link/service/series/0558/bibs/2431/24310062.htm},
Keywords = {Learning from large datasets and Bias-Variance},
Location = {Helsinki, Finland},
Related = {learning-from-large-datasets}
}
ABSTRACT This paper reviews the appropriateness for application to large data sets of standard machine learning algorithms, which were mainly developed in the context of small data sets. Sampling and parallelization have proved useful means for reducing computation time when learning from large data sets. However, such methods assume that algorithms that were designed for use with what are now considered small data sets are also fundamentally suitable for large data sets. It is plausible that optimal learning from large data sets requires a different type of algorithm to optimal learning from small data sets. This paper investigates one respect in which data set size may affect the requirements of a learning algorithm – the bias plus variance decomposition of classification error. Experiments show that learning from large data sets may be more effective when using an algorithm that places greater emphasis on bias management, rather than variance management.

Frayman, Y., Rolfe, B., & Webb, G. I. (2002). Improving an Inverse Model of Sheet Metal Forming by Neural Network Based Regression. Proceedings of the Design Engineering Technical Conferences and Computer and Information in Engineering Conference (DETC'02/ASME 2002), New York, pp. 1-8.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{FraymanRolfeWebb02b,
Title = {Improving an Inverse Model of Sheet Metal Forming by Neural Network Based Regression},
Author = {Y Frayman and B. Rolfe and G. I. Webb},
Booktitle = {Proceedings of the Design Engineering Technical Conferences and Computer and Information in Engineering Conference (DETC'02/ASME 2002)},
Year = {2002},
Address = {New York},
Pages = {1-8},
Publisher = {ASME Press},
Abstract = {The inverse model for a sheet metal forming process aims to determine the initial parameter levels required to form the final formed shape. This is a difficult problem that is usually approached by traditional methods such as finite element analysis. Formulating the problem as a classification problem makes it possible to use well-established classification algorithms such as decision trees. The classification is, however, generally based on a winner-takes-all approach when associating the output value with the corresponding class. On the other hand, when formulating the problem as a regression task, all the output values are combined to produce the corresponding class value. For a multi-class problem, this may result in very different associations between the output of the model and the corresponding class. Such a formulation makes it possible to use well-known regression algorithms such as neural networks. In this paper, we develop a neural network based inverse model of a sheet forming process, and compare its performance with that of a linear model. Both models are used in two modes, classification mode and function estimation mode, to investigate the advantage of re-formulating the problem as function estimation. This results in large improvements in the recognition rate of set-up parameters of a sheet metal forming process for both models, with the neural network model achieving much more accurate parameter recognition than the linear model.},
Audit-trail = {*},
Keywords = {Engineering Applications},
Related = {engineering-applications}
}
ABSTRACT The inverse model for a sheet metal forming process aims to determine the initial parameter levels required to form the final formed shape. This is a difficult problem that is usually approached by traditional methods such as finite element analysis. Formulating the problem as a classification problem makes it possible to use well-established classification algorithms such as decision trees. The classification is, however, generally based on a winner-takes-all approach when associating the output value with the corresponding class. On the other hand, when formulating the problem as a regression task, all the output values are combined to produce the corresponding class value. For a multi-class problem, this may result in very different associations between the output of the model and the corresponding class. Such a formulation makes it possible to use well-known regression algorithms such as neural networks. In this paper, we develop a neural network based inverse model of a sheet forming process, and compare its performance with that of a linear model. Both models are used in two modes, classification mode and function estimation mode, to investigate the advantage of re-formulating the problem as function estimation. This results in large improvements in the recognition rate of set-up parameters of a sheet metal forming process for both models, with the neural network model achieving much more accurate parameter recognition than the linear model.

Yang, Y., & Webb, G. I. (2002). Non-Disjoint Discretization for Naive-Bayes Classifiers. Proceedings of the Nineteenth International Conference on Machine Learning (ICML '02), San Francisco, pp. 666-673.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebb02b,
Title = {Non-Disjoint Discretization for Naive-Bayes Classifiers},
Author = {Y. Yang and G. I. Webb},
Booktitle = {Proceedings of the Nineteenth International Conference on Machine Learning (ICML '02)},
Year = {2002},
Address = {San Francisco},
Editor = {C. Sammut and A.G. Hoffmann},
Pages = {666-673},
Publisher = {Morgan Kaufmann},
Abstract = {Previous discretization techniques have discretized numeric attributes into disjoint intervals. We argue that this is neither necessary nor appropriate for naive-Bayes classifiers. The analysis leads to a new discretization method, Non-Disjoint Discretization (NDD). NDD forms overlapping intervals for a numeric attribute, always locating a value toward the middle of its discretized interval to obtain more reliable probability estimation. It also adjusts the number and size of discretized intervals to the number of training instances, seeking an appropriate trade-off between bias and variance of probability estimation. We justify NDD in theory and test it on a wide cross-section of datasets. Our experimental results suggest that for naive-Bayes classifiers, NDD works better than alternative discretization approaches.},
Audit-trail = {Posted by Ying at http://www.cs.uvm.edu/~yyang/ndd.pdf No link on GW page - 9/2/05 requested permission},
Keywords = {Discretization for Naive Bayes},
Location = {Sydney, Australia},
Related = {discretization-for-naive-bayes}
}
ABSTRACT Previous discretization techniques have discretized numeric attributes into disjoint intervals. We argue that this is neither necessary nor appropriate for naive-Bayes classifiers. The analysis leads to a new discretization method, Non-Disjoint Discretization (NDD). NDD forms overlapping intervals for a numeric attribute, always locating a value toward the middle of its discretized interval to obtain more reliable probability estimation. It also adjusts the number and size of discretized intervals to the number of training instances, seeking an appropriate trade-off between bias and variance of probability estimation. We justify NDD in theory and test it on a wide cross-section of datasets. Our experimental results suggest that for naive-Bayes classifiers, NDD works better than alternative discretization approaches.
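
The central device in NDD, an interval re-centred on each observed value rather than a fixed set of cut points, is easy to illustrate. The Python sketch below is only a minimal rendering of that idea: it assumes the target interval size is roughly the square root of the number of training values (in the spirit of the authors' related proportional discretization work), and it ignores the paper's finer construction details such as tie handling.

import math
from bisect import bisect_left

def ndd_interval(value, sorted_train_values):
    # Return (lo, hi) bounds of an overlapping interval roughly centred on
    # `value`, containing about sqrt(N) training values.
    n = len(sorted_train_values)
    size = max(1, int(math.sqrt(n)))                 # desired interval size
    pos = bisect_left(sorted_train_values, value)
    lo = max(0, min(pos - size // 2, n - size))      # centre the window, clamp at the ends
    return sorted_train_values[lo], sorted_train_values[lo + size - 1]

def interval_likelihood(value, class_values, sorted_train_values):
    # P(attribute falls in value's interval | class), estimated by counting
    # training values of that class inside the interval, with add-one smoothing.
    lo, hi = ndd_interval(value, sorted_train_values)
    inside = sum(1 for v in class_values if lo <= v <= hi)
    return (inside + 1) / (len(class_values) + 2)

# 100 training values of one numeric attribute
train = sorted((x * 37) % 100 / 10 for x in range(100))
print(ndd_interval(5.0, train))                      # interval centred near 5.0
print(interval_likelihood(5.0, [v for v in train if v >= 4.0], train))

Because neighbouring test values receive overlapping, shifted intervals, each value sits near the middle of the interval used for its probability estimate, which is the property the abstract emphasises.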

Rolfe, B., Frayman, Y., Hodgson, P., & Webb, G. I. (2002). Fault Detection in a Cold Forging Process Through Feature Extraction with a Neural Network. Proceedings of the IASTED International Conference on Artificial Intelligence and Applications (AIA 2002), Calgary, Canada, pp. 155-159.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{RolfeFraymanHodgsonWebb02,
Title = {Fault Detection in a Cold Forging Process Through Feature Extraction with a Neural Network},
Author = {B. Rolfe and Y Frayman and P. Hodgson and G. I. Webb},
Booktitle = {Proceedings of the IASTED International Conference on Artificial Intelligence and Applications ({AIA} 2002)},
Year = {2002},
Address = {Calgary, Canada},
Pages = {155-159},
Publisher = {ACTA Press},
Abstract = {This paper investigates the application of neural networks to the recognition of lubrication defects typical of an industrial cold forging process employed by fastener manufacturers. The accurate recognition of lubrication errors, such as coating not being applied properly or damaged during material handling, is very important to the quality of the final product in fastener manufacture. Lubrication errors lead to increased forging loads and premature tool failure, as well as to increased defect sorting and the re-processing of the coated rod. The lubrication coating provides a barrier between the work material and the die during the drawing operation; moreover, it needs to be sufficiently robust to remain on the wire during the transfer to the cold forging operation. In the cold forging operation the wire undergoes multi-stage deformation without the application of any additional lubrication. Four types of lubrication errors, typical of the production of fasteners, were introduced to a set of sample rods, which were subsequently drawn under laboratory conditions. The drawing force was measured, from which a limited set of features was extracted. The neural network based model learned from these features is able to recognize all types of lubrication errors with high accuracy. The overall accuracy of the neural network model is around 98% with an almost uniform distribution of errors between the four error types and the normal condition.},
Audit-trail = {*},
Keywords = {Engineering Applications},
Location = {Benalmádena, Spain},
Related = {engineering-applications}
}
ABSTRACT This paper investigates the application of neural networks to the recognition of lubrication defects typical of an industrial cold forging process employed by fastener manufacturers. The accurate recognition of lubrication errors, such as coating not being applied properly or damaged during material handling, is very important to the quality of the final product in fastener manufacture. Lubrication errors lead to increased forging loads and premature tool failure, as well as to increased defect sorting and the re-processing of the coated rod. The lubrication coating provides a barrier between the work material and the die during the drawing operation; moreover, it needs to be sufficiently robust to remain on the wire during the transfer to the cold forging operation. In the cold forging operation the wire undergoes multi-stage deformation without the application of any additional lubrication. Four types of lubrication errors, typical of the production of fasteners, were introduced to a set of sample rods, which were subsequently drawn under laboratory conditions. The drawing force was measured, from which a limited set of features was extracted. The neural network based model learned from these features is able to recognize all types of lubrication errors with high accuracy. The overall accuracy of the neural network model is around 98% with an almost uniform distribution of errors between the four error types and the normal condition.

Frayman, Y., Rolfe, B., Hodgson, P., & Webb, G. I. (2002). Predicting The Rolling Force in Hot Steel Rolling Mill using an Ensemble Model. Proceedings of the Second IASTED International Conference on Artificial Intelligence and Applications (AIA '02), Calgary, Canada, pp. 143-148.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{FraymanRolfeHodgsonWebb02c,
Title = {Predicting The Rolling Force in Hot Steel Rolling Mill using an Ensemble Model},
Author = {Y. Frayman and B. Rolfe and P. Hodgson and G. I. Webb},
Booktitle = {Proceedings of the Second IASTED International Conference on Artificial Intelligence and Applications (AIA '02)},
Year = {2002},
Address = {Calgary, Canada},
Pages = {143-148},
Publisher = {ACTA Press},
Abstract = {Accurate prediction of the roll separating force is critical to assuring the quality of the final product in steel manufacturing. This paper presents an ensemble model that addresses this concern. A stacked generalisation approach to ensemble modeling is used with two sets of ensemble members, the first set being learnt from the current input-output data of the hot rolling finishing mill, while the other uses the available information on the previous coil in addition to the current information. Both sets of ensemble members include linear regression, multilayer perceptron, and k-nearest neighbor algorithms. A competitive selection model (multilayer perceptron) is then used to select the output from one of the ensemble members to be the final output of the ensemble model. The ensemble model created by such a stacked generalization is able to achieve extremely high accuracy in predicting the roll separation force, with the average relative accuracy being within 1% of the actual measured roll force.},
Audit-trail = {*},
Keywords = {Engineering Applications},
Location = {Benalmádena, Spain},
Related = {engineering-applications}
}
ABSTRACT Accurate prediction of the roll separating force is critical to assuring the quality of the final product in steel manufacturing. This paper presents an ensemble model that addresses this concern. A stacked generalisation approach to ensemble modeling is used with two sets of ensemble members, the first set being learnt from the current input-output data of the hot rolling finishing mill, while the other uses the available information on the previous coil in addition to the current information. Both sets of ensemble members include linear regression, multilayer perceptron, and k-nearest neighbor algorithms. A competitive selection model (multilayer perceptron) is then used to select the output from one of the ensemble members to be the final output of the ensemble model. The ensemble model created by such a stacked generalization is able to achieve extremely high accuracy in predicting the roll separation force, with the average relative accuracy being within 1% of the actual measured roll force.
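
For readers who want a concrete picture of the competitive, selector-based arrangement described above, here is a small sketch using synthetic data and scikit-learn estimators as stand-ins for the mill data and the paper's models. The way the selector's training target is built (the member with the smallest error on each training case) is an illustrative assumption, not necessarily the paper's exact stacked-generalisation protocol.

import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(600, 4))
y = np.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 + 0.1 * rng.normal(size=600)   # nonlinear, noisy target

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

members = [
    LinearRegression(),
    KNeighborsRegressor(n_neighbors=5),
    MLPRegressor(hidden_layer_sizes=(20,), max_iter=2000, random_state=0),
]
for m in members:
    m.fit(X_tr, y_tr)

# Selector target: index of the member with the smallest error on each training case.
train_preds = np.column_stack([m.predict(X_tr) for m in members])
best_member = np.argmin(np.abs(train_preds - y_tr[:, None]), axis=1)

selector = MLPClassifier(hidden_layer_sizes=(20,), max_iter=2000, random_state=0)
selector.fit(X_tr, best_member)

# Competitive ensemble output: each test case uses only its selected member.
test_preds = np.column_stack([m.predict(X_te) for m in members])
chosen = selector.predict(X_te)
ensemble_pred = test_preds[np.arange(len(X_te)), chosen]
print("MAE:", np.mean(np.abs(ensemble_pred - y_te)))

Note that labelling the selector from in-sample errors is optimistic; a more faithful stacked setup would derive those labels from held-out (for example cross-validated) predictions.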

Webb, G. I., & Brain, D. (2002). Generality is Predictive of Prediction Accuracy. Proceedings of the 2002 Pacific Rim Knowledge Acquisition Workshop (PKAW'02), Tokyo, pp. 117-130.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbBrain02,
Title = {Generality is Predictive of Prediction Accuracy},
Author = {G. I. Webb and D. Brain},
Booktitle = {Proceedings of the 2002 {Pacific} Rim Knowledge Acquisition Workshop (PKAW'02)},
Year = {2002},
Address = {Tokyo},
Editor = {T. Yamaguchi and A. Hoffmann and H. Motoda and P. Compton },
Pages = {117-130},
Publisher = {Japanese Society for Artificial Intelligence},
Abstract = {There has been a dearth of research into the relative impacts of alternative high level learning biases. This paper presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. It is argued that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. It is also argued that the accuracy on unseen cases of the more specific rule will tend to be closer to the accuracy obtained on training data than will the accuracy of the more general rule. Experimental evidence is provided in support of these hypotheses. We argue that these hypotheses can be of use in selecting appropriate learning biases to achieve specific learning objectives.},
Audit-trail = {*},
Keywords = {Occams Razor and Generality},
Location = {Tokyo, Japan},
Related = {generality-is-predictive-of-prediction-accuracy}
}
ABSTRACT There has been a dearth of research into the relative impacts of alternative high level learning biases. This paper presents two hypotheses about the expected impact of selecting between classification rules of differing levels of generality in the absence of other evidence about their likely relative performance on unseen data. It is argued that the accuracy on unseen data of the more general rule will tend to be closer to that of a default rule for the class than will that of the more specific rule. It is also argued that the accuracy on unseen cases of the more specific rule will tend to be closer to the accuracy obtained on training data than will the accuracy of the more general rule. Experimental evidence is provided in support of these hypotheses. We argue that these hypotheses can be of use in selecting appropriate learning biases to achieve specific learning objectives.

Webb, G. I. (2002). Integrating Machine Learning with Knowledge Acquisition. In Leondes, C. T. (Ed.), Expert Systems (Vol. 3, pp. 937-959). San Diego, CA: Academic Press.
[PDF] [Bibtex]  → Related papers and software

@InCollection{Webb02,
Title = {Integrating Machine Learning with Knowledge Acquisition},
Author = {G. I. Webb},
Booktitle = {Expert Systems },
Publisher = {Academic Press},
Year = {2002},
Address = {San Diego, CA},
Editor = {C. T. Leondes},
Pages = {937-959},
Volume = {3},
Audit-trail = {23/8 waiting on permission to post PDF. Received permission and posted PDF},
Keywords = {Machine Learning with Knowledge Acquisition from Experts and Machine Learning},
Related = {interactive-machine-learning}
}
ABSTRACT 

Webb, G. I., Boughton, J., & Wang, Z. (2002). Averaged One-Dependence Estimators: Preliminary Results. Proceedings of the First Australasian Data Mining Workshop (AusDM02), Sydney, pp. 65-73.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbBoughtonWang02,
Title = {Averaged One-Dependence Estimators: Preliminary Results},
Author = {G. I. Webb and J. Boughton and Z. Wang},
Booktitle = {Proceedings of the First Australasian Data Mining Workshop (AusDM02)},
Year = {2002},
Address = {Sydney},
Editor = {S.J Simoff and G.J Williams and M. Hegland },
Pages = {65-73},
Publisher = {University of Technology},
Abstract = {Naive Bayes is a simple, computationally efficient and remarkably accurate approach to classification learning. These properties have led to its wide deployment in many online applications. However, it is based on an assumption that all attributes are conditionally independent given the class. This assumption leads to decreased accuracy in some applications. AODE overcomes the attribute independence assumption of naive Bayes by averaging over all models in which all attributes depend upon the class and a single other attribute. The resulting classification learning algorithm for nominal data is computationally efficient and achieves very low error rates.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation and AODE},
Location = {Canberra, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Naive Bayes is a simple, computationally efficient and remarkably accurate approach to classification learning. These properties have led to its wide deployment in many online applications. However, it is based on an assumption that all attributes are conditionally independent given the class. This assumption leads to decreased accuracy in some applications. AODE overcomes the attribute independence assumption of naive Bayes by averaging over all models in which all attributes depend upon the class and a single other attribute. The resulting classification learning algorithm for nominal data is computationally efficient and achieves very low error rates.
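
The averaging scheme described above maps onto a short implementation for nominal data. The sketch below follows the published AODE formulation (average, over each attribute used as a super-parent, of P(y, x_i) multiplied by the product of P(x_j | y, x_i)), but the Laplace smoothing constants and the frequency threshold m are illustrative choices rather than the authors' settings.

from collections import defaultdict

class AODE:
    def __init__(self, m=1):
        self.m = m  # minimum frequency for a value to act as a super-parent

    def fit(self, X, y):
        self.n = len(X)
        self.classes = sorted(set(y))
        self.values = [sorted({row[i] for row in X}) for i in range(len(X[0]))]
        self.count_i = defaultdict(int)     # (i, x_i)
        self.count_yi = defaultdict(int)    # (class, i, x_i)
        self.count_yij = defaultdict(int)   # (class, i, x_i, j, x_j)
        for row, c in zip(X, y):
            for i, vi in enumerate(row):
                self.count_i[(i, vi)] += 1
                self.count_yi[(c, i, vi)] += 1
                for j, vj in enumerate(row):
                    if j != i:
                        self.count_yij[(c, i, vi, j, vj)] += 1
        return self

    def joint(self, c, row):
        total, parents = 0.0, 0
        for i, vi in enumerate(row):
            if self.count_i[(i, vi)] < self.m:
                continue  # skip infrequent super-parent values
            # P(c, x_i), Laplace-smoothed
            p = (self.count_yi[(c, i, vi)] + 1.0) / (self.n + len(self.classes) * len(self.values[i]))
            for j, vj in enumerate(row):
                if j != i:
                    # P(x_j | c, x_i), Laplace-smoothed
                    p *= (self.count_yij[(c, i, vi, j, vj)] + 1.0) / (self.count_yi[(c, i, vi)] + len(self.values[j]))
            total += p
            parents += 1
        return total / parents if parents else 1.0 / len(self.classes)

    def predict(self, row):
        return max(self.classes, key=lambda c: self.joint(c, row))

# Toy nominal data: two attributes, two classes.
X = [("sunny", "high"), ("sunny", "low"), ("rain", "high"), ("rain", "low")]
y = ["no", "yes", "yes", "yes"]
print(AODE().fit(X, y).predict(("sunny", "high")))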

Frayman, Y., Rolfe, B., & Webb, G. I. (2002). Solving Regression Problems using Competitive Ensemble Models. Lecture Notes in Computer Science Vol. 2557: Proceedings of the 15th Australian Joint Conference on Artificial Intelligence (AI 02), Berlin/Heidelberg, pp. 511-522.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{FraymanRolfeWebb02,
Title = {Solving Regression Problems using Competitive Ensemble Models},
Author = {Y Frayman and B. Rolfe and G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 2557: Proceedings of the 15th Australian Joint Conference on Artificial Intelligence (AI 02)},
Year = {2002},
Address = {Berlin/Heidelberg},
Editor = {B. McKay and J.K. Slaney },
Pages = {511-522},
Publisher = {Springer},
Abstract = {The use of ensemble models in many problem domains has increased significantly in the last few years. Ensemble modelling, in particular boosting, has shown great promise in improving the predictive performance of a model. Combining the ensemble members is normally done in a co-operative fashion where each of the ensemble members performs the same task and their predictions are aggregated to obtain improved performance. However, it is also possible to combine the ensemble members in a competitive fashion where the best prediction of a relevant ensemble member is selected for a particular input. This option has previously been somewhat overlooked. The aim of this article is to investigate and compare the competitive and co-operative approaches to combining the models in the ensemble. A comparison is made between a competitive ensemble model and MARS with bagging, mixture of experts, hierarchical mixture of experts and a neural network ensemble over several public domain regression problems that have a high degree of nonlinearity and noise. The empirical results show a substantial advantage of competitive learning versus co-operative learning for all the regression problems investigated. The requirements for creating efficient ensembles and the available guidelines are also discussed.},
Audit-trail = {*},
Keywords = {Engineering Applications},
Location = {Canberra, Australia},
Related = {engineering-applications}
}
ABSTRACT The use of ensemble models in many problem domains has increased significantly in the last few years. Ensemble modelling, in particular boosting, has shown great promise in improving the predictive performance of a model. Combining the ensemble members is normally done in a co-operative fashion where each of the ensemble members performs the same task and their predictions are aggregated to obtain improved performance. However, it is also possible to combine the ensemble members in a competitive fashion where the best prediction of a relevant ensemble member is selected for a particular input. This option has previously been somewhat overlooked. The aim of this article is to investigate and compare the competitive and co-operative approaches to combining the models in the ensemble. A comparison is made between a competitive ensemble model and MARS with bagging, mixture of experts, hierarchical mixture of experts and a neural network ensemble over several public domain regression problems that have a high degree of nonlinearity and noise. The empirical results show a substantial advantage of competitive learning versus co-operative learning for all the regression problems investigated. The requirements for creating efficient ensembles and the available guidelines are also discussed.

Pearce, J., Webb, G. I., Shaw, R., & Garner, B. (2002). A Systemic Approach to the Database Marketing Process. Proceedings of the Australian and New Zealand Marketing Academy Conference (ANZMAC 02), Geelong, Victoria, pp. 2941-2948.
[PDF] [Bibtex] [Abstract]

@InProceedings{PearceWebbShawGarner02,
Title = {A Systemic Approach to the Database Marketing Process},
Author = {J. Pearce and G. I. Webb and R. Shaw and B. Garner},
Booktitle = {Proceedings of the Australian and New Zealand Marketing Academy Conference (ANZMAC 02)},
Year = {2002},
Address = {Geelong, Victoria},
Pages = {2941-2948},
Publisher = {Deakin University (CD Rom)},
Abstract = {The role of database marketing (DBM) has become increasingly important for organisations that have large databases of information on customers with whom they deal directly. At the same time, DBM models used in practice have increased in sophistication. This paper examines a systemic view of DBM and the role of analytical techniques within DBM. It extends existing process models to develop a systemic model that encompasses the increased complexity of DBM in practice. The systemic model provides a framework to integrate data mining, experimental design and prioritisation decisions. This paper goes on to identify opportunities for research in DBM, including DBM process models used in practice, the use of evolutionary operations techniques in DBM, prioritisation decisions, and the factors that surround the uptake of DBM.},
Audit-trail = {*},
Location = {Geelong, Australia}
}
ABSTRACT The role of database marketing (DBM) has become increasingly important for organisations that have large databases of information on customers with whom they deal directly. At the same time, DBM models used in practice have increased in sophistication. This paper examines a systemic view of DBM and the role of analytical techniques within DBM. It extends existing process models to develop a systemic model that encompasses the increased complexity of DBM in practice. The systemic model provides a framework to integrate data mining, experimental design and prioritisation decisions. This paper goes on to identify opportunities for research in DBM, including DBM process models used in practice, the use of evolutionary operations techniques in DBM, prioritisation decisions, and the factors that surround the uptake of DBM.

Wang, Z., & Webb, G. I. (2002). Comparison of Lazy Bayesian Rule Learning and Tree-Augmented Bayesian Learning. Proceedings of the IEEE International Conference on Data Mining (ICDM-2002), Los Alamitos, CA, pp. 775-778.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebb02,
Title = {Comparison of Lazy Bayesian Rule Learning and Tree-Augmented Bayesian Learning},
Author = { Z. Wang and G.I. Webb},
Booktitle = {Proceedings of the {IEEE} International Conference on Data Mining (ICDM-2002)},
Year = {2002},
Address = {Los Alamitos, CA},
Pages = {775-778},
Publisher = {{IEEE} Computer Society},
Abstract = {The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. Among these, Lazy Bayesian Rules (LBR) and Tree-Augmented Naïve-Bayes (TAN) have demonstrated strong prediction accuracy. However, their relative performance has never been evaluated. This paper compares and contrasts these two techniques, finding that they have comparable accuracy and hence should be selected according to computational profile. LBR is desirable when small numbers of objects are to be classified, while TAN is desirable when large numbers of objects are to be classified.},
Audit-trail = {http://csdl.computer.org/comp/proceedings/icdm/2002/1754/00/1754toc.htm},
Keywords = {Conditional Probability Estimation},
Location = {Maebashi City, Japan},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT The naive Bayes classifier is widely used in interactive applications due to its computational efficiency, direct theoretical base, and competitive accuracy. However, its attribute independence assumption can result in sub-optimal accuracy. A number of techniques have explored simple relaxations of the attribute independence assumption in order to increase accuracy. Among these, Lazy Bayesian Rules (LBR) and Tree-Augmented Naïve-Bayes (TAN) have demonstrated strong prediction accuracy. However, their relative performance has never been evaluated. This paper compares and contrasts these two techniques, finding that they have comparable accuracy and hence should be selected according to computational profile. LBR is desirable when small numbers of objects are to be classified, while TAN is desirable when large numbers of objects are to be classified.

Wang, Z., & Webb, G. I. (2002). A Heuristic Lazy Bayesian Rules Algorithm. Proceedings of the First Australasian Data Mining Workshop (AusDM02), Sydney, pp. 57-63.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebb02b,
Title = {A Heuristic Lazy Bayesian Rules Algorithm},
Author = {Z. Wang and G. I. Webb},
Booktitle = {Proceedings of the First Australasian Data Mining Workshop (AusDM02)},
Year = {2002},
Address = {Sydney},
Editor = {S.J Simoff and G.J Williams and M. Hegland },
Pages = {57-63},
Publisher = {University of Technology},
Abstract = {Lazy Bayesian rule has demonstrated outstanding classification accuracy. However, it has high computational overheads when large numbers of instances are classified from a single training set. We compare lazy Bayesian rule and the tree-augmented Bayesian classifier, and present a new heuristic lazy Bayesian rule classifier that combines elements of the two. It requires less computation than lazy Bayesian rule, but demonstrates similar prediction accuracy.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation},
Location = {Canberra, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Lazy Bayesian rule has demonstrated outstanding classification accuracy. However, it has high computational overheads when large numbers of instances are classified from a single training set. We compare lazy Bayesian rule and the tree-augmented Bayesian classifier, and present a new heuristic lazy Bayesian rule classifier that combines elements of the two. It requires less computation than lazy Bayesian rule, but demonstrates similar prediction accuracy.

Webb, G. I. (2001). Discovering Associations with Numeric Variables. Proceedings of the Seventh ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2001) [short paper], New York, pp. 383-388.
[PDF] [URL] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb01a,
Title = {Discovering Associations with Numeric Variables},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Seventh {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2001)[short paper]},
Year = {2001},
Address = {New York},
Editor = {F. Provost and R. Srikant},
Pages = {383-388},
Publisher = {The Association for Computing Machinery},
Abstract = {This paper further develops Aumann and Lindell's [3] proposal for a variant of association rules for which the consequent is a numeric variable. It is argued that these rules can discover useful interactions with numeric data that cannot be discovered directly using traditional association rules with discretization. Alternative measures for identifying interesting rules are proposed. Efficient algorithms are presented that enable these rules to be discovered for dense data sets for which application of Aumann and Lindell's algorithm is infeasible.},
Audit-trail = {*},
Keywords = {Impact Rules and OPUS and Association Rule Discovery},
Location = {San Francisco, CA},
Related = {impact-rules},
Url = {http://dl.acm.org/authorize?19861}
}
ABSTRACT This paper further develops Aumann and Lindell's [3] proposal for a variant of association rules for which the consequent is a numeric variable. It is argued that these rules can discover useful interactions with numeric data that cannot be discovered directly using traditional association rules with discretization. Alternative measures for identifying interesting rules are proposed. Efficient algorithms are presented that enable these rules to be discovered for dense data sets for which application of Aumann and Lindell's algorithm is infeasible.
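
As a concrete illustration of rules with a numeric consequent, the fragment below brute-forces short conjunctions of attribute-value pairs and scores each by coverage multiplied by the difference between the conditional and overall mean of the target. That score is a stand-in in the spirit of the measures the paper discusses, not necessarily its exact definitions, and the exhaustive enumeration replaces, rather than reproduces, the efficient search the paper presents.

from itertools import combinations
from statistics import mean

def numeric_rules(records, target, max_conditions=2, top_k=5):
    # records: list of dicts of nominal attributes; target: list of numbers.
    overall = mean(target)
    items = sorted({(a, r[a]) for r in records for a in r})
    scored = []
    for size in range(1, max_conditions + 1):
        for conds in combinations(items, size):
            covered = [t for r, t in zip(records, target)
                       if all(r.get(a) == v for a, v in conds)]
            if not covered:
                continue
            score = len(covered) * (mean(covered) - overall)   # coverage * lift of the mean
            scored.append((score, conds, len(covered), mean(covered)))
    return sorted(scored, reverse=True)[:top_k]

records = [{"region": "N", "channel": "web"}, {"region": "N", "channel": "store"},
           {"region": "S", "channel": "web"}, {"region": "S", "channel": "store"}]
target = [120.0, 80.0, 95.0, 60.0]
for score, conds, n, m in numeric_rules(records, target):
    print(" & ".join("%s=%s" % c for c in conds), "-> mean %.1f over %d records (score %.1f)" % (m, n, score))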

Webb, G. I. (2001). Candidate Elimination Criteria for Lazy Bayesian Rules. Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01), Berlin/Heidelberg, pp. 545-556.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb01b,
Title = {Candidate Elimination Criteria for Lazy Bayesian Rules},
Author = {G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01)},
Year = {2001},
Address = {Berlin/Heidelberg},
Editor = {M. Stumptner and D. Corbett and M.J. Brooks},
Pages = {545-556},
Publisher = {Springer},
Abstract = {Lazy Bayesian Rules modify naive Bayesian classification to undo elements of the harmful attribute independence assumption. It has been shown to provide classification error comparable to boosting decision trees. This paper explores alternatives to the candidate elimination criterion employed within Lazy Bayesian Rules. Improvements over naive Bayes are consistent so long as the candidate elimination criterion ensures there is sufficient data for accurate probability estimation. However, the original candidate elimination criterion is demonstrated to provide better overall error reduction than the use of a minimum data subset size criterion.},
Audit-trail = {*},
Doi = {10.1007/3-540-45656-2_47},
Keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
Location = {Adelaide, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Lazy Bayesian Rules modify naive Bayesian classification to undo elements of the harmful attribute independence assumption. It has been shown to provide classification error comparable to boosting decision trees. This paper explores alternatives to the candidate elimination criterion employed within Lazy Bayesian Rules. Improvements over naive Bayes are consistent so long as the candidate elimination criterion ensures there is sufficient data for accurate probability estimation. However, the original candidate elimination criterion is demonstrated to provide better overall error reduction than the use of a minimum data subset size criterion.

Wang, Z., Webb, G. I., & Dai, H. (2001). Implementation of Lazy Bayesian Rules in the Weka System. Software Technology Catering for 21st Century: Proceedings of the International Symposium on Future Software Technology (ISFST2001), Tokyo, pp. 204-208.
[Bibtex] [Abstract]  → Related papers and software

@InProceedings{WangWebbDai01,
Title = {Implementation of Lazy Bayesian Rules in the Weka System},
Author = {Z. Wang and G. I. Webb and H. Dai},
Booktitle = {Software Technology Catering for 21st Century: Proceedings of the International Symposium on Future Software Technology (ISFST2001)},
Year = {2001},
Address = {Tokyo},
Pages = {204-208},
Publisher = {Software Engineers Association},
Abstract = {The naïve Bayesian classification algorithms were shown to be computationally efficient and surprisingly accurate even when the conditional independence assumption on which they are based is violated. The lazy Bayesian rule is the application of lazy learning techniques to Bayesian tree induction, which supports a weaker conditional attribute independence assumption. The Weka system is a full, industrial-strength implementation of almost all of the state-of-the-art machine learning techniques, and it contains a framework, in the form of a Java class library, which supports applications that use embedded machine learning and even the implementation of new learning schemes. In this paper, we mainly discuss the implementation of the lazy Bayesian rule algorithm in the Weka system, and introduce all the methods to be used in the Java class. This is the first lazy learning scheme implemented in the Weka system.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation},
Location = {Zheng Zhou, China},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT The naïve Bayesian classification algorithms were shown to be computationally efficient and surprisingly accurate even when the conditional independence assumption on which they are based is violated. The lazy Bayesian rule is the application of lazy learning techniques to Bayesian tree induction, which supports a weaker conditional attribute independence assumption. The Weka system is a full, industrial-strength implementation of almost all of the state-of-the-art machine learning techniques, and it contains a framework, in the form of a Java class library, which supports applications that use embedded machine learning and even the implementation of new learning schemes. In this paper, we mainly discuss the implementation of the lazy Bayesian rule algorithm in the Weka system, and introduce all the methods to be used in the Java class. This is the first lazy learning scheme implemented in the Weka system.

Yang, Y., & Webb, G. I. (2001). Proportional K-Interval Discretization for Naive-Bayes Classifiers. Lecture Notes in Computer Science 2167: Proceedings of the 12th European Conference on Machine Learning (ECML'01), Berlin/Heidelberg, pp. 564-575.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YangWebb01,
Title = {Proportional K-Interval Discretization for Naive-Bayes Classifiers},
Author = {Y. Yang and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 2167: Proceedings of the 12th European Conference on Machine Learning (ECML'01)},
Year = {2001},
Address = {Berlin/Heidelberg},
Editor = {L. DeRaedt and P. A. Flach },
Pages = {564-575},
Publisher = {Springer-Verlag},
Abstract = {This paper argues that two commonly-used discretization approaches, fixed k-interval discretization and entropy-based discretization, have sub-optimal characteristics for naive-Bayes classification. This analysis leads to a new discretization method, Proportional k-Interval Discretization (PKID), which adjusts the number and size of discretized intervals to the number of training instances, thus seeking an appropriate trade-off between the bias and variance of the probability estimation for naive-Bayes classifiers. We justify PKID in theory, as well as test it on a wide cross-section of datasets. Our experimental results suggest that in comparison to its alternatives, PKID provides naive-Bayes classifiers with competitive classification performance for smaller datasets and better classification performance for larger datasets.},
Audit-trail = {http://link.springer.de/link/service/series/0558/bibs/2167/21670564.htm},
Keywords = {Discretization for Naive Bayes},
Location = {Freiburg, Germany},
Related = {discretization-for-naive-bayes}
}
ABSTRACT This paper argues that two commonly-used discretization approaches, fixed k-interval discretization and entropy-based discretization, have sub-optimal characteristics for naive-Bayes classification. This analysis leads to a new discretization method, Proportional k-Interval Discretization (PKID), which adjusts the number and size of discretized intervals to the number of training instances, thus seeking an appropriate trade-off between the bias and variance of the probability estimation for naive-Bayes classifiers. We justify PKID in theory, as well as test it on a wide cross-section of datasets. Our experimental results suggest that in comparison to its alternatives, PKID provides naive-Bayes classifiers with competitive classification performance for smaller datasets and better classification performance for larger datasets.
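
PKID's trade-off is simple to state in code: set both the number of equal-frequency intervals and the number of instances per interval to roughly the square root of the training-set size, so that additional data buys both more intervals (lower bias) and more instances per interval (lower variance). The sketch below is a minimal rendering of that rule; tie handling and the treatment of the final interval are simplified.

import math

def pkid_cut_points(values):
    # Return cut points for equal-frequency intervals with
    # interval count ~= interval size ~= sqrt(len(values)).
    data = sorted(values)
    n = len(data)
    t = max(1, int(math.sqrt(n)))        # number of intervals
    size = n // t                        # instances per interval
    return [data[i * size] for i in range(1, t)]

def discretize(value, cuts):
    # Map a numeric value to the index of its interval.
    return sum(value > c for c in cuts)

values = [v * 0.1 for v in range(100)]   # 100 training values
cuts = pkid_cut_points(values)
print(len(cuts) + 1, "intervals; value 3.14 falls in interval", discretize(3.14, cuts))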

Webb, G. I., & Zhang, S. (2001). Further Pruning for Efficient Association Rule Discovery. Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01), Berlin, pp. 605-618.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbZhang01,
Title = {Further Pruning for Efficient Association Rule Discovery},
Author = {G.I. Webb and S. Zhang},
Booktitle = {Lecture Notes in Computer Science Vol. 2256: Proceedings of the 14th Australian Joint Conference on Artificial Intelligence (AI'01)},
Year = {2001},
Address = {Berlin},
Editor = {M. Stumptner and D. Corbett and M.J. Brooks},
Pages = {605-618},
Publisher = {Springer},
Abstract = {The Apriori algorithm's frequent itemset approach has become the standard approach to discovering association rules. However, the computation requirements of the frequent itemset approach are infeasible for dense data and the approach is unable to discover infrequent associations. OPUS\_AR is an efficient algorithm for rule discovery that does not utilize frequent itemsets and hence avoids these problems. It can reduce search time by using additional constraints on the search space as well as constraints on itemset frequency. However, the effectiveness of the pruning rules used during search will determine the efficiency of its search. This paper presents and analyzes pruning rules for use with OPUS\_AR. We demonstrate that application of OPUS\_AR is feasible for a number of datasets for which application of the frequent itemset approach is infeasible and that the new pruning rules can reduce compute time by more than 40%.},
Audit-trail = {*},
Keywords = {OPUS and Association Rule Discovery},
Location = {Adelaide, Australia},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT The Apriori algorithm's frequent itemset approach has become the standard approach to discovering association rules. However, the computation requirements of the frequent itemset approach are infeasible for dense data and the approach is unable to discover infrequent associations. OPUS_AR is an efficient algorithm for rule discovery that does not utilize frequent itemsets and hence avoids these problems. It can reduce search time by using additional constraints on the search space as well as constraints on itemset frequency. However, the effectiveness of the pruning rules used during search will determine the efficiency of its search. This paper presents and analyzes pruning rules for use with OPUS_AR. We demonstrate that application of OPUS_AR is feasible for a number of datasets for which application of the frequent itemset approach is infeasible and that the new pruning rules can reduce compute time by more than 40%.

Webb, G. I., Pazzani, M. J., & Billsus, D. (2001). Machine learning for user modeling. User Modeling and User-Adapted Interaction, 11, 19-20.
[PDF] [DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbPazzaniBillsus01,
Title = {Machine learning for user modeling},
Author = {G. I. Webb and M. J. Pazzani and D. Billsus},
Journal = {User Modeling and User-Adapted Interaction},
Year = {2001},
Pages = {19-20},
Volume = {11},
Abstract = {At first blush, user modeling appears to be a prime candidate for straightforward application of standard machine learning techniques. Observations of the user's behavior can provide training examples that a machine learning system can use to form a model designed to predict future actions. However, user modeling poses a number of challenges for machine learning that have hindered its application in user modeling, including: the need for large data sets; the need for labelled data; concept drift; and computational complexity. This paper examines each of these issues and reviews approaches to resolving them.},
Address = {Netherlands},
Audit-trail = {Link to pdf via UMUAI site. Also available at http://www.kluweronline.com/issn/0924-1868},
Doi = {10.1023/A:1011117102175},
Keywords = {Feature Based Modeling and User Modeling},
Publisher = {Springer},
Related = {feature-based-modeling}
}
ABSTRACT At first blush, user modeling appears to be a prime candidate for straightforward application of standard machine learning techniques. Observations of the user's behavior can provide training examples that a machine learning system can use to form a model designed to predict future actions. However, user modeling poses a number of challenges for machine learning that have hindered its application in user modeling, including: the need for large data sets; the need for labelled data; concept drift; and computational complexity. This paper examines each of these issues and reviews approaches to resolving them.

Smith, P. A., & Webb, G. I. (2000). The Efficacy of a Low-Level Program Visualization Tool for Teaching Programming Concepts to Novice C Programmers. Journal of Educational Computing Research, 22(2), 187-215.
[PDF] [Bibtex]  → Related papers and software

@Article{SmithWebb00,
Title = {The Efficacy of a Low-Level Program Visualization Tool for Teaching Programming Concepts to Novice C Programmers},
Author = {P. A. Smith and G. I. Webb},
Journal = {Journal of Educational Computing Research},
Year = {2000},
Number = {2},
Pages = {187-215},
Volume = {22},
Audit-trail = {Link to pdf via Baywood Publishing Company},
Keywords = {Program Visualisation},
Publisher = {Baywood Publishing},
Related = {program-visualisation}
}
ABSTRACT 

Zheng, Z., & Webb, G. I. (2000). Lazy Learning of Bayesian Rules. Machine Learning, 41(1), 53-84.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{ZhengWebb00,
Title = {Lazy Learning of Bayesian Rules},
Author = {Z. Zheng and G. I. Webb},
Journal = {Machine Learning},
Year = {2000},
Number = {1},
Pages = {53-84},
Volume = {41},
Abstract = {The naive Bayesian classifier provides a simple and effective approach to classifier learning, but its attribute independence assumption is often violated in the real world. A number of approaches have sought to alleviate this problem. A Bayesian tree learning algorithm builds a decision tree, and generates a local naive Bayesian classifier at each leaf. The tests leading to a leaf can alleviate attribute inter-dependencies for the local naive Bayesian classifier. However, Bayesian tree learning still suffers from the small disjunct problem of tree learning. While inferred Bayesian trees demonstrate low average prediction error rates, there is reason to believe that error rates will be higher for those leaves with few training examples. This paper proposes the application of lazy learning techniques to Bayesian tree induction and presents the resulting lazy Bayesian rule learning algorithm, called LBR. For each test example, it builds a most appropriate rule with a local naive Bayesian classifier as its consequent. It is demonstrated that the computational requirements of LBR are reasonable in a wide cross-section of natural domains. Experiments with these domains show that, on average, this new algorithm obtains lower error rates significantly more often than the reverse in comparison to a naive Bayesian classifier, C4.5, a Bayesian tree learning algorithm, a constructive Bayesian classifier that eliminates attributes and constructs new attributes using Cartesian products of existing nominal attributes, and a lazy decision tree learning algorithm. It also outperforms, although the result is not statistically significant, a selective naive Bayesian classifier.},
Address = {Netherlands},
Audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF posted 30/10/03},
Doi = {10.1023/A:1007613203719},
Keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
Publisher = {Springer},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT The naive Bayesian classifier provides a simple and effective approach to classifier learning, but its attribute independence assumption is often violated in the real world. A number of approaches have sought to alleviate this problem. A Bayesian tree learning algorithm builds a decision tree, and generates a local naive Bayesian classifier at each leaf. The tests leading to a leaf can alleviate attribute inter-dependencies for the local naive Bayesian classifier. However, Bayesian tree learning still suffers from the small disjunct problem of tree learning. While inferred Bayesian trees demonstrate low average prediction error rates, there is reason to believe that error rates will be higher for those leaves with few training examples. This paper proposes the application of lazy learning techniques to Bayesian tree induction and presents the resulting lazy Bayesian rule learning algorithm, called LBR. For each test example, it builds a most appropriate rule with a local naive Bayesian classifier as its consequent. It is demonstrated that the computational requirements of LBR are reasonable in a wide cross-section of natural domains. Experiments with these domains show that, on average, this new algorithm obtains lower error rates significantly more often than the reverse in comparison to a naive Bayesian classifier, C4.5, a Bayesian tree learning algorithm, a constructive Bayesian classifier that eliminates attributes and constructs new attributes using Cartesian products of existing nominal attributes, and a lazy decision tree learning algorithm. It also outperforms, although the result is not statistically significant, a selective naive Bayesian classifier.
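
A greatly simplified sketch of the lazy Bayesian rule idea follows, for nominal data: given one test case, attribute-value tests matching that case are greedily moved into the rule antecedent whenever doing so lowers the leave-one-out error of a local naive Bayes built from the covered training subset. The published LBR accepts a test only when the error reduction passes a significance test and includes further refinements omitted here; the smoothing used below is also an illustrative simplification.

from collections import Counter

def nb_predict(train, labels, free_atts, case):
    # Naive Bayes over the attributes not yet consumed by the rule antecedent.
    classes = Counter(labels)
    def score(c):
        p = classes[c] / len(labels)
        for a in free_atts:
            match = sum(1 for row, l in zip(train, labels) if l == c and row[a] == case[a])
            p *= (match + 1) / (classes[c] + 2)      # simple add-one smoothing
        return p
    return max(classes, key=score)

def loo_error(train, labels, free_atts):
    # Leave-one-out error of the local naive Bayes on this subset.
    errors = 0
    for i in range(len(train)):
        rest, rest_l = train[:i] + train[i + 1:], labels[:i] + labels[i + 1:]
        if nb_predict(rest, rest_l, free_atts, train[i]) != labels[i]:
            errors += 1
    return errors

def lbr_classify(train, labels, test_case):
    free_atts = list(range(len(test_case)))          # attributes still used by naive Bayes
    subset, subset_l = train, labels                 # training cases covered by the rule so far
    best = loo_error(subset, subset_l, free_atts)
    improved = True
    while improved and len(free_atts) > 1:
        improved = False
        for a in list(free_atts):
            cov = [(r, l) for r, l in zip(subset, subset_l) if r[a] == test_case[a]]
            if len(cov) < 2:
                continue
            rows, ls = [r for r, _ in cov], [l for _, l in cov]
            err = loo_error(rows, ls, [x for x in free_atts if x != a])
            if err < best:                           # the real algorithm demands a significant drop
                best, subset, subset_l = err, rows, ls
                free_atts.remove(a)
                improved = True
                break
    return nb_predict(subset, subset_l, free_atts, test_case)

train = [("a", "x"), ("a", "y"), ("b", "x"), ("b", "y")]
labels = ["+", "+", "-", "-"]
print(lbr_classify(train, labels, ("a", "x")))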

Webb, G. I. (2000). MultiBoosting: A Technique for Combining Boosting and Wagging. Machine Learning, 40(2), 159-196.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb00a,
Title = {MultiBoosting: A Technique for Combining Boosting and Wagging},
Author = {G. I. Webb},
Journal = {Machine Learning},
Year = {2000},
Number = {2},
Pages = {159-196},
Volume = {40},
Abstract = {MultiBoosting is an extension to the highly successful AdaBoost technique for forming decision committees. MultiBoosting can be viewed as combining AdaBoost with wagging. It is able to harness both AdaBoost's high bias and variance reduction with wagging's superior variance reduction. Using C4.5 as the base learning algorithm, Multi-boosting is demonstrated to produce decision committees with lower error than either AdaBoost or wagging significantly more often than the reverse over a large representative cross-section of UCI data sets. It offers the further advantage over AdaBoost of suiting parallel execution.},
Address = {Netherlands},
Audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF posted 30/10/03},
Doi = {10.1023/A:1007659514849},
Keywords = {MultiBoosting and Boosting and Bias-Variance},
Publisher = {Springer},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT MultiBoosting is an extension to the highly successful AdaBoost technique for forming decision committees. MultiBoosting can be viewed as combining AdaBoost with wagging. It is able to harness both AdaBoost's high bias and variance reduction with wagging's superior variance reduction. Using C4.5 as the base learning algorithm, Multi-boosting is demonstrated to produce decision committees with lower error than either AdaBoost or wagging significantly more often than the reverse over a large representative cross-section of UCI data sets. It offers the further advantage over AdaBoost of suiting parallel execution.
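
The combination the abstract describes, AdaBoost-style reweighting inside wagging-style sub-committees, can be sketched compactly in Python. In the fragment below, a committee of decision trees is split into sub-committees; instance weights are re-drawn wagging-style (continuous Poisson, approximated here by -log U) at each sub-committee boundary and updated by the standard AdaBoost rule within a sub-committee. Sub-committee sizing, the handling of degenerate members and other details follow the paper only loosely.

import numpy as np
from sklearn.tree import DecisionTreeClassifier

def multiboost(X, y, n_members=10, n_subcommittees=3, rng=None):
    rng = rng or np.random.default_rng(0)
    n = len(y)
    members, alphas = [], []
    per_sub = int(np.ceil(n_members / n_subcommittees))
    w = None
    for t in range(n_members):
        if t % per_sub == 0:                          # sub-committee boundary: re-wag the weights
            w = -np.log(rng.uniform(1e-12, 1.0, size=n))
            w *= n / w.sum()
        tree = DecisionTreeClassifier(max_depth=3, random_state=t).fit(X, y, sample_weight=w)
        pred = tree.predict(X)
        err = w[pred != y].sum() / w.sum()
        if err <= 0 or err >= 0.5:                    # degenerate member: restart weights, skip it
            w = -np.log(rng.uniform(1e-12, 1.0, size=n))
            w *= n / w.sum()
            continue
        beta = err / (1 - err)
        members.append(tree)
        alphas.append(np.log(1 / beta))
        w = np.where(pred == y, w * beta, w)          # AdaBoost update: shrink correctly classified cases
        w *= n / w.sum()                              # renormalise
    return members, alphas

def committee_predict(members, alphas, X, classes=(0, 1)):
    votes = np.zeros((len(X), len(classes)))
    for tree, a in zip(members, alphas):
        for i, p in enumerate(tree.predict(X)):
            votes[i, list(classes).index(p)] += a     # weighted vote, as in AdaBoost
    return np.array(classes)[votes.argmax(axis=1)]

rng = np.random.default_rng(1)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + X[:, 1] ** 2 > 1).astype(int)
members, alphas = multiboost(X, y)
print("training accuracy:", np.mean(committee_predict(members, alphas, X) == y))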

Webb, G. I. (2000). Efficient Search for Association Rules. Proceedings of the Sixth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD-2000), New York, pp. 99-107.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb00b,
Title = {Efficient Search for Association Rules},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Sixth {ACM} {SIGKDD} International Conference on Knowledge Discovery and Data Mining (KDD-2000)},
Year = {2000},
Address = {New York},
Editor = {R. Ramakrishnan and S. Stolfo},
Pages = {99-107},
Publisher = {The Association for Computing Machinery},
Abstract = {This paper argues that for some applications direct search for association rules can be more efficient than the two stage process of the Apriori algorithm which first finds large item sets which are then used to identify associations. In particular, it is argued, Apriori can impose large computational overheads when the number of frequent itemsets is very large. This will often be the case when association rule analysis is performed on domains other than basket analysis or when it is performed for basket analysis with basket information augmented by other customer information. An algorithm is presented that is computationally efficient for association rule analysis during which the number of rules to be found can be constrained and all data can be maintained in memory.},
Audit-trail = {*},
Keywords = {Search and OPUS and Association Rule Discovery},
Location = {Boston, MA},
Related = {filtered-top-k-association-discovery}
}
ABSTRACT This paper argues that for some applications direct search for association rules can be more efficient than the two stage process of the Apriori algorithm which first finds large item sets which are then used to identify associations. In particular, it is argued, Apriori can impose large computational overheads when the number of frequent itemsets is very large. This will often be the case when association rule analysis is performed on domains other than basket analysis or when it is performed for basket analysis with basket information augmented by other customer information. An algorithm is presented that is computationally efficient for association rule analysis during which the number of rules to be found can be constrained and all data can be maintained in memory.
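
To make the contrast with the frequent-itemset route concrete, here is a much-simplified direct search: antecedent conjunctions for a fixed consequent are explored depth-first, the k best rules by leverage are retained, and any branch whose coverage already falls below the minimum is pruned (coverage can only shrink as conditions are added). The paper's operator reordering and stronger pruning rules are not reproduced, and the leverage measure and constants are illustrative choices.

import heapq

def top_k_rules(transactions, consequent, k=3, min_cover=2):
    n = len(transactions)
    items = sorted({i for t in transactions for i in t} - {consequent})
    p_cons = sum(consequent in t for t in transactions) / n
    best = []  # min-heap of (leverage, antecedent)

    def search(antecedent, covered, start):
        # Depth-first over antecedents; no frequent itemsets are materialised.
        for idx in range(start, len(items)):
            item = items[idx]
            new_cov = [t for t in covered if item in t]
            if len(new_cov) < min_cover:
                continue                      # this branch and its extensions cannot qualify
            ante = antecedent + (item,)
            support = len(new_cov) / n
            conf = sum(consequent in t for t in new_cov) / len(new_cov)
            leverage = support * (conf - p_cons)
            if len(best) < k:
                heapq.heappush(best, (leverage, ante))
            elif leverage > best[0][0]:
                heapq.heapreplace(best, (leverage, ante))
            search(ante, new_cov, idx + 1)

    search((), transactions, 0)
    return sorted(best, reverse=True)

transactions = [{"bread", "milk"}, {"bread", "butter", "milk"},
                {"beer", "bread"}, {"milk", "butter"}, {"bread", "butter", "milk"}]
for lev, ante in top_k_rules(transactions, "milk"):
    print(" & ".join(ante), "-> milk  (leverage %.3f)" % lev)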

Smith, P. A., & Webb, G. I. (1999). Evaluation of Low-Level Program Visualisation for Teaching Novice C Programmers. Proceedings of the Seventh International Conference on Computers in Education (ICCE '99), Amsterdam, pp. 385-392.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{SmithWebb99,
Title = {Evaluation of Low-Level Program Visualisation for Teaching Novice C Programmers},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {Proceedings of the Seventh International Conference on Computers in Education (ICCE '99)},
Year = {1999},
Address = {Amsterdam},
Editor = {G. Cumming and T. Okamoto and L. Gomez},
Pages = {385-392},
Publisher = {IOS Press},
Volume = {2},
Abstract = {While several program visualisation tools aimed at novice programmers have been developed over the past decade, there is little empirical evidence showing that novices actually benefit from their use (Mulholland, 1995). Bradman (Smith & Webb, 1998) is a low-level program visualisation tool. We present an experiment that tests the efficacy of Bradman in assisting novice programmers learn programming concepts. We show that students with access to this low-level program visualisation tool achieved greater understanding of some programming concepts than those without access.},
Audit-trail = {pdf on file is early stage with corrections},
Keywords = {Program Visualisation},
Location = {Chiba, Japan},
Related = {program-visualisation}
}
ABSTRACT While several program visualisation tools aimed at novice programmers have been developed over the past decade, there is little empirical evidence showing that novices actually benefit from their use (Mulholland, 1995). Bradman (Smith & Webb, 1998) is a low-level program visualisation tool. We present an experiment that tests the efficacy of Bradman in assisting novice programmers learn programming concepts. We show that students with access to this low-level program visualisation tool achieved greater understanding of some programming concepts than those without access.

Webb, G. I., Wells, J., & Zheng, Z. (1999). An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition. Machine Learning, 35(1), 5-24.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbWellsZheng99,
Title = {An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition},
Author = {G. I. Webb and J. Wells and Z. Zheng},
Journal = {Machine Learning},
Year = {1999},
Number = {1},
Pages = {5-24},
Volume = {35},
Abstract = {Machine learning and knowledge acquisition from experts have distinct capabilities that appear to complement one another. We report a study that demonstrates the integration of these approaches can both improve the accuracy of the developed knowledge base and reduce development time. In addition, we found that users expected the expert systems created through the integrated approach to have higher accuracy than those created without machine learning and rated the integrated approach less difficult to use. They also provided favorable evaluations of both the specific integrated software system, called The Knowledge Factory, and of the general value of machine learning for knowledge acquisition.},
Address = {Netherlands},
Audit-trail = {27/10/03 requested permission to post pp pdf. 28/10/03 Permission granted by Kluwer. PDF Posted 30/10/03},
Keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
Publisher = {Springer},
Related = {interactive-machine-learning}
}
ABSTRACT Machine learning and knowledge acquisition from experts have distinct capabilities that appear to complement one another. We report a study that demonstrates the integration of these approaches can both improve the accuracy of the developed knowledge base and reduce development time. In addition, we found that users expected the expert systems created through the integrated approach to have higher accuracy than those created without machine learning and rated the integrated approach less difficult to use. They also provided favorable evaluations of both the specific integrated software system, called The Knowledge Factory, and of the general value of machine learning for knowledge acquisition.

Zheng, Z., Webb, G. I., & Ting, K. M. (1999). Lazy Bayesian Rules: A Lazy Semi-Naive Bayesian Learning Technique Competitive to Boosting Decision Trees. Proceedings of the Sixteenth International Conference on Machine Learning (ICML-99), San Francisco, pp. 493-502.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebbTing99,
Title = {Lazy Bayesian Rules: A Lazy Semi-Naive Bayesian Learning Technique Competitive to Boosting Decision Trees},
Author = {Z. Zheng and G. I. Webb and K. M. Ting},
Booktitle = {Proceedings of the Sixteenth International Conference on Machine Learning (ICML-99)},
Year = {1999},
Address = {San Francisco},
Editor = {I. Bratko and S. Dzeroski},
Pages = {493-502},
Publisher = {Morgan Kaufmann},
Abstract = {LBR is a lazy semi-naive Bayesian classifier learning technique, designed to alleviate the attribute interdependence problem of naive Bayesian classification. To classify a test example, it creates a conjunctive rule that selects a most appropriate subset of training examples and induces a local naive Bayesian classifier using this subset. LBR can significantly improve the performance of the naive Bayesian classifier. A bias and variance analysis of LBR reveals that it significantly reduces the bias of naive Bayesian classification at a cost of a slight increase in variance. It is interesting to compare this lazy technique with boosting and bagging, two well-known state-of-the-art non-lazy learning techniques. Empirical comparison of LBR with boosting decision trees on discrete valued data shows that LBR has, on average, significantly lower variance and higher bias. As a result of the interaction of these effects, the average prediction error of LBR over a range of learning tasks is at a level directly comparable to boosting. LBR provides a very competitive discrete valued learning technique where error minimization is the primary concern. It is very efficient when a single classifier is to be applied to classify few cases, such as in a typical incremental learning scenario.},
Audit-trail = {Link via Citeseer},
Keywords = {Conditional Probability Estimation and Bayesian Learning and Lazy Bayesian Rules and Lazy Learning},
Location = {Bled, Slovenia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT LBR is a lazy semi-naive Bayesian classifier learning technique, designed to alleviate the attribute interdependence problem of naive Bayesian classification. To classify a test example, it creates a conjunctive rule that selects a most appropriate subset of training examples and induces a local naive Bayesian classifier using this subset. LBR can significantly improve the performance of the naive Bayesian classifier. A bias and variance analysis of LBR reveals that it significantly reduces the bias of naive Bayesian classification at a cost of a slight increase in variance. It is interesting to compare this lazy technique with boosting and bagging, two well-known state-of-the-art non-lazy learning techniques. Empirical comparison of LBR with boosting decision trees on discrete valued data shows that LBR has, on average, significantly lower variance and higher bias. As a result of the interaction of these effects, the average prediction error of LBR over a range of learning tasks is at a level directly comparable to boosting. LBR provides a very competitive discrete valued learning technique where error minimization is the primary concern. It is very efficient when a single classifier is to be applied to classify few cases, such as in a typical incremental learning scenario.

Zheng, Z., & Webb, G. I. (1999). Stochastic Attribute Selection Committees with Multiple Boosting: Learning More Accurate and More Stable Classifier Committees. Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third Pacific-Asia Conference (PAKDD'99), Berlin/Heidelberg, pp. 123-132.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb99b,
Title = {Stochastic Attribute Selection Committees with Multiple Boosting: Learning More Accurate and More Stable Classifier Committees},
Author = {Z. Zheng and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third {Pacific}-{Asia} Conference (PAKDD'99)},
Year = {1999},
Address = {Berlin/Heidelberg},
Editor = {N. Zhong and L. Zhou},
Pages = {123-132},
Publisher = {Springer-Verlag},
Abstract = {Classifier learning is a key technique for KDD. Approaches to learning classifier committees, including Boosting, Bagging, SASC, and SASCB, have demonstrated great success in increasing the prediction accuracy of decision trees. Boosting and Bagging create different classifiers by modifying the distribution of the training set. SASC adopts a different method. It generates committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. SASCB, a combination of Boosting and SASC, has shown the ability to further increase, on average, the prediction accuracy of decision trees. It has been found that the performance of SASCB and Boosting is more variable than that of SASC, although SASCB is more accurate than the others on average. In this paper, we present a novel method to reduce the variability of SASCB and Boosting, and further increase their average accuracy. It generates multiple committees by incorporating Bagging into SASCB. As well as improving stability and average accuracy, the resulting method is amenable to parallel or distributed processing, while Boosting and SASCB are not. This is an important characteristic for data mining in large datasets.},
Audit-trail = {http://link.springer.de/link/service/series/0558/bibs/1574/15740123.htm},
Keywords = {MultiBoosting and Boosting and Stochastic Attribute Selection committees},
Location = {Beijing, China},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT Classifier learning is a key technique for KDD. Approaches to learning classifier committees, including Boosting, Bagging, SASC, and SASCB, have demonstrated great success in increasing the prediction accuracy of decision trees. Boosting and Bagging create different classifiers by modifying the distribution of the training set. SASC adopts a different method. It generates committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. SASCB, a combination of Boosting and SASC, has shown the ability to further increase, on average, the prediction accuracy of decision trees. It has been found that the performance of SASCB and Boosting is more variable than that of SASC, although SASCB is more accurate than the others on average. In this paper, we present a novel method to reduce the variability of SASCB and Boosting, and further increase their average accuracy. It generates multiple committees by incorporating Bagging into SASCB. As well as improving stability and average accuracy, the resulting method is amenable to parallel or distributed processing, while Boosting and SASCB are not. This is an important characteristic for data mining in large datasets.

Webb, G. I. (1999). Decision Tree Grafting From The All Tests But One Partition. Proceedings of the Sixteenth International Joint Conference on Artificial Intelligence (IJCAI 99), San Francisco, pp. 702-707.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb99,
Title = {Decision Tree Grafting From The All Tests But One Partition},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Sixteenth International Joint Conference on Artificial Intelligence ({IJCAI} 99)},
Year = {1999},
Address = {San Francisco},
Editor = {T. Dean},
Pages = {702-707},
Publisher = {Morgan Kaufmann},
Abstract = {Decision tree grafting adds nodes to an existing decision tree with the objective of reducing prediction error. A new grafting algorithm is presented that considers one set of training data only for each leaf of the initial decision tree, the set of cases that fail at most one test on the path to the leaf. This new technique is demonstrated to retain the error reduction power of the original grafting algorithm while dramatically reducing compute time and the complexity of the inferred tree. Bias/variance analysis reveal that the original grafting technique operated primarily by variance reduction while the new technique reduces both bias and variance.},
Audit-trail = {PDF posted with the permission of {IJCAI} Inc},
Keywords = {Decision Tree Learning and Decision Tree Grafting and Occams Razor},
Location = {Stockholm, Sweden},
Related = {decision-tree-grafting}
}
ABSTRACT Decision tree grafting adds nodes to an existing decision tree with the objective of reducing prediction error. A new grafting algorithm is presented that considers one set of training data only for each leaf of the initial decision tree, the set of cases that fail at most one test on the path to the leaf. This new technique is demonstrated to retain the error reduction power of the original grafting algorithm while dramatically reducing compute time and the complexity of the inferred tree. Bias/variance analysis reveal that the original grafting technique operated primarily by variance reduction while the new technique reduces both bias and variance.
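
The "all tests but one" partition itself is easy to make concrete. A minimal sketch follows (illustrative names, not the paper's grafting code): given the tests on the path to a leaf, it selects the training cases that fail at most one of those tests.

# Minimal sketch of the "all tests but one" partition (ATBOP) for one leaf.
# Illustrates only the partition definition, not the full grafting search.
import numpy as np

def atbop_mask(X, path_tests):
    # Boolean mask of training cases that fail at most one (feature, threshold, goes_left)
    # test on the path to a given leaf.
    failures = np.zeros(len(X), dtype=int)
    for feature, threshold, goes_left in path_tests:
        satisfied = X[:, feature] <= threshold if goes_left else X[:, feature] > threshold
        failures += ~satisfied
    return failures <= 1

# Example: a leaf reached by the tests (x0 <= 2.5) then (x1 > 7.0).
X = np.array([[1.0, 9.0], [1.0, 3.0], [4.0, 9.0], [4.0, 3.0]])
print(atbop_mask(X, [(0, 2.5, True), (1, 7.0, False)]))   # -> [ True  True  True False]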

Chiu, B. C., & Webb, G. I. (1999). Dual-Model: An Architecture for Utilizing Temporal Information in Student Modeling. Proceedings of the Seventh International Conference on Computers in Education (ICCE '99), Amsterdam, pp. 111-118.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ChiuWebb99a,
Title = {Dual-Model: An Architecture for Utilizing Temporal Information in Student Modeling},
Author = {B. C. Chiu and G. I. Webb},
Booktitle = {Proceedings of the Seventh International Conference on Computers in Education (ICCE '99)},
Year = {1999},
Address = {Amsterdam},
Editor = {G. Cumming and T. Okamoto and L. Gomez },
Pages = {111-118 },
Publisher = {IOS Press},
Volume = {1},
Abstract = {A modeling system may be required to predict an agent's future actions even when confronted by inadequate or contradictory relevant evidence from observations of past actions. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. This raises two issues. First, when maximizing prediction rate is preferable, what mechanisms can be employed such that a system can make more predictions without severely degrading prediction accuracy? Second, for contexts in which accuracy is of primary importance, how can we further improve prediction accuracy? A recently proposed Dual-model approach, which takes models' temporal characteristics into account, suggests a solution to the first problem, but leaves room for further improvement. This paper presents two classes of Dual-model variant. Each aims to achieve one of the above objectives. With the performance of the original system as a baseline, which does not utilize the temporal information, empirical evaluations in the domain of elementary subtraction show that one class of variant outperforms the baseline in prediction rate while the other does so in prediction accuracy, without significantly affecting other overall measures of the original performance. Keywords: Agent modeling, Student modeling, Temporal model, Decision tree.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and User Modeling},
Location = {Chiba, Japan. (Also appeared in the Proceedings of ACAI Workshop W03: Machine Learning in User Modeling, pp. 46-53)},
Related = {feature-based-modeling}
}
ABSTRACT A modeling system may be required to predict an agent's future actions even when confronted by inadequate or contradictory relevant evidence from observations of past actions. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. This raises two issues. First, when maximizing prediction rate is preferable, what mechanisms can be employed such that a system can make more predictions without severely degrading prediction accuracy? Second, for contexts in which accuracy is of primary importance, how can we further improve prediction accuracy? A recently proposed Dual-model approach, which takes models' temporal characteristics into account, suggests a solution to the first problem, but leaves room for further improvement. This paper presents two classes of Dual-model variant. Each aims to achieve one of the above objectives. With the performance of the original system as a baseline, which does not utilize the temporal information, empirical evaluations in the domain of elementary subtraction show that one class of variant outperforms the baseline in prediction rate while the other does so in prediction accuracy, without significantly affecting other overall measures of the original performance. Keywords: Agent modeling, Student modeling, Temporal model, Decision tree.

Ting, K. M., Zheng, Z., & Webb, G. I. (1999). Learning Lazy Rules to Improve the Performance of Classifiers. Proceedings of the Nineteenth SGES International Conference on Knowledge Based Systems and Applied Artificial Intelligence (ES'99), New York, pp. 122-131.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{TingZhengWebb99,
Title = {Learning Lazy Rules to Improve the Performance of Classifiers},
Author = {K. M. Ting and Z. Zheng and G. I. Webb},
Booktitle = {Proceedings of the Nineteenth SGES International Conference on Knowledge Based Systems and Applied Artificial Intelligence (ES'99)},
Year = {1999},
Address = {New York},
Editor = {F. Coenen and A. Macintosh},
Pages = {122-131},
Publisher = {Springer},
Abstract = {Based on an earlier study on lazy Bayesian rule learning, this paper introduces a general lazy learning framework, called LAZYRULE, that begins to learn a rule only when classifying a test case. The objective of the framework is to improve the performance of a base learning algorithm. It has the potential to be used for different types of base learning algorithms. LAZYRULE performs attribute elimination and training case selection using cross-validation to generate the most appropriate rule for each test case. At the consequent of the rule, it applies the base learning algorithm on the selected training subset and the remaining attributes to construct a classifier to make a prediction. This combined action seeks to build a better performing classifier for each test case than the classifier trained using all attributes and all training cases. We show empirically that LAZYRULE improves the performances of naive Bayesian classifiers and majority vote.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation},
Location = {Peterhouse College, Cambridge, UK},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Based on an earlier study on lazy Bayesian rule learning, this paper introduces a general lazy learning framework, called LAZYRULE, that begins to learn a rule only when classifying a test case. The objective of the framework is to improve the performance of a base learning algorithm. It has the potential to be used for different types of base learning algorithms. LAZYRULE performs attribute elimination and training case selection using cross-validation to generate the most appropriate rule for each test case. At the consequent of the rule, it applies the base learning algorithm on the selected training subset and the remaining attributes to construct a classifier to make a prediction. This combined action seeks to build a better performing classifier for each test case than the classifier trained using all attributes and all training cases. We show empirically that LAZYRULE improves the performances of naive Bayesian classifiers and majority vote.

Brain, D., & Webb, G. I. (1999). On The Effect of Data Set Size on Bias And Variance in Classification Learning. Proceedings of the Fourth Australian Knowledge Acquisition Workshop (AKAW-99), Sydney, pp. 117-128.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{BrainWebb99,
Title = {On The Effect of Data Set Size on Bias And Variance in Classification Learning},
Author = {D. Brain and G. I. Webb},
Booktitle = {Proceedings of the Fourth {Australian} Knowledge Acquisition Workshop ({AKAW}-99)},
Year = {1999},
Address = {Sydney},
Editor = {D. Richards and G. Beydoun and A. Hoffmann and P. Compton },
Pages = {117-128},
Publisher = {The University of New South Wales},
Abstract = {With the advent of data mining, machine learning has come of age and is now a critical technology in many businesses. However, machine learning evolved in a different research context to that in which it now finds itself employed. A particularly important problem in the data mining world is working effectively with large data sets. However, most machine learning research has been conducted in the context of learning from very small data sets. To date, most approaches to scaling up machine learning to large data sets have attempted to modify existing algorithms to deal with large data sets in a more computationally efficient and effective manner. But is this necessarily the best method? This paper explores the possibility of designing algorithms specifically for large data sets. Specifically, the paper looks at how increasing data set size affects bias and variance error decompositions for classification algorithms. Preliminary results of experiments to determine these effects are presented, showing that, as hypothesized, variance can be expected to decrease as training set size increases. No clear effect of training set size on bias was observed. These results have profound implications for data mining from large data sets, indicating that developing effective learning algorithms for large data sets is not simply a matter of finding computationally efficient variants of existing learning algorithms.},
Audit-trail = {*},
Keywords = {Learning from large datasets and Bias-Variance},
Location = {Sydney, Australia},
Related = {learning-from-large-datasets}
}
ABSTRACT With the advent of data mining, machine learning has come of age and is now a critical technology in many businesses. However, machine learning evolved in a different research context to that in which it now finds itself employed. A particularly important problem in the data mining world is working effectively with large data sets. However, most machine learning research has been conducted in the context of learning from very small data sets. To date, most approaches to scaling up machine learning to large data sets have attempted to modify existing algorithms to deal with large data sets in a more computationally efficient and effective manner. But is this necessarily the best method? This paper explores the possibility of designing algorithms specifically for large data sets. Specifically, the paper looks at how increasing data set size affects bias and variance error decompositions for classification algorithms. Preliminary results of experiments to determine these effects are presented, showing that, as hypothesized, variance can be expected to decrease as training set size increases. No clear effect of training set size on bias was observed. These results have profound implications for data mining from large data sets, indicating that developing effective learning algorithms for large data sets is not simply a matter of finding computationally efficient variants of existing learning algorithms.
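
Experiments of the kind described here are straightforward to reproduce in outline. A rough sketch, assuming scikit-learn and a synthetic data set: bias is measured as the error of the most common ("main") prediction over repeated training samples of a given size, and variance as the average disagreement with that main prediction. These are generic 0-1-loss definitions and subsampling choices, not necessarily the paper's exact protocol.

# Rough sketch of how error variance changes with training set size, using repeated
# random training samples of a decision tree. Definitions and data are illustrative.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.tree import DecisionTreeClassifier

def bias_variance(X, y, X_test, y_test, n_train, trials=30, seed=0):
    rng = np.random.default_rng(seed)
    preds = np.empty((trials, len(y_test)), dtype=int)
    for t in range(trials):
        idx = rng.choice(len(y), size=n_train, replace=False)    # a fresh training sample
        preds[t] = DecisionTreeClassifier(random_state=t).fit(X[idx], y[idx]).predict(X_test)
    main = np.array([np.bincount(col).argmax() for col in preds.T])   # most common prediction
    bias = np.mean(main != y_test)           # error of the main prediction
    variance = np.mean(preds != main)        # average disagreement with the main prediction
    return bias, variance

X, y = make_classification(n_samples=20000, n_features=20, random_state=1)
X_tr, y_tr, X_te, y_te = X[:15000], y[:15000], X[15000:], y[15000:]
for n in (100, 1000, 10000):
    print(n, bias_variance(X_tr, y_tr, X_te, y_te, n))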

Newlands, D., & Webb, G. I. (1999). Convex Hulls in Concept Induction. Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third Pacific-Asia Conference (PAKDD'99), Berlin/Heidelberg, pp. 306-316.
[URL] [Bibtex] [Abstract]

@InProceedings{NewlandsWebb99a,
Title = {Convex Hulls in Concept Induction},
Author = {D. Newlands and G. I. Webb},
Booktitle = {Lecture Notes in Computer Science 1574: Methodologies for Knowledge Discovery and Data Mining - Proceedings of the Third {Pacific}-{Asia} Conference (PAKDD'99)},
Year = {1999},
Address = {Berlin/Heidelberg},
Editor = {N. Zhong and L. Zhou},
Pages = {306-316},
Publisher = {Springer-Verlag},
Abstract = {This paper investigates modelling concepts as a few, large convex hulls rather than as many, small, axis-orthogonal divisions as is done by systems which currently dominate classification learning. It is argued that this approach produces classifiers which have less strong hypothesis language bias and which, because of the fewness of the concepts induced, are more understandable. The design of such a system is described and its performance is investigated. Convex hulls are shown to be a useful inductive generalisation technique offering rather different biases than well-known systems such as C4.5 and CN2. The types of domains where convex hulls can be usefully employed are described.},
Audit-trail = {http://link.springer.de/link/service/series/0558/bibs/1574/15740306.htm},
Location = {Beijing, China},
Url = {http://link.springer.com/chapter/10.1007/3-540-48912-6_42}
}
ABSTRACT This paper investigates modelling concepts as a few, large convex hulls rather than as many, small, axis-orthogonal divisions as is done by systems which currently dominate classification learning. It is argued that this approach produces classifiers which have less strong hypothesis language bias and which, because of the fewness of the concepts induced, are more understandable. The design of such a system is described and its performance is investigated. Convex hulls are shown to be a useful inductive generalisation technique offering rather different biases than well-known systems such as C4.5 and CN2. The types of domains where convex hulls can be usefully employed are described.
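
A minimal sketch of the underlying idea, assuming SciPy and one hull per class; the paper's system may induce several hulls per concept and resolve overlaps differently, and the class name and tie-breaking rule here are choices of the sketch. Delaunay triangulations are used only as a convenient point-in-hull test.

# Simplified sketch of classifying with one convex hull per class.
import numpy as np
from scipy.spatial import Delaunay

class ConvexHullClassifier:
    def fit(self, X, y):
        self.classes_ = np.unique(y)
        # A Delaunay triangulation of each class doubles as a fast point-in-convex-hull test.
        self.hulls_ = {c: Delaunay(X[y == c]) for c in self.classes_}
        self.priors_ = {c: float(np.mean(y == c)) for c in self.classes_}
        return self

    def predict(self, X):
        out = []
        for x in X:
            inside = [c for c in self.classes_ if self.hulls_[c].find_simplex(x) >= 0]
            if len(inside) == 1:
                out.append(inside[0])
            else:
                # Outside every hull, or inside an overlap: fall back to the larger class.
                candidates = inside or list(self.classes_)
                out.append(max(candidates, key=lambda c: self.priors_[c]))
        return np.array(out)

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0.0, 1.0, (50, 2)), rng.normal(4.0, 1.0, (50, 2))])
y = np.array([0] * 50 + [1] * 50)
print(ConvexHullClassifier().fit(X, y).predict(np.array([[0.0, 0.0], [4.0, 4.0]])))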

Chiu, B. C., & Webb, G. I. (1998). Using Decision Trees For Agent Modelling: Improving Prediction Performance. User Modeling and User-Adapted Interaction, 8(1-2), 131-152.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@Article{ChiuWebb98,
Title = {Using Decision Trees For Agent Modelling: Improving Prediction Performance},
Author = {B. C. Chiu and G. I. Webb},
Journal = {User Modeling and User-Adapted Interaction},
Year = {1998},
Number = {1-2},
Pages = {131-152},
Volume = {8},
Abstract = {A modeling system may be required to predict an agent's future actions under constraints of inadequate or contradictory relevant historical evidence. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. A previous study that explored techniques for improving prediction rates in the context of modeling students' subtraction skills using Feature Based Modeling showed a tradeoff between prediction rate and prediction accuracy. This paper presents research that aims to improve prediction rates without affecting prediction accuracy. The FBM-C4.5 agent modeling system was used in this research. However, the techniques explored are applicable to any Feature Based Modeling system, and the most effective technique developed is applicable to most agent modeling systems. The default FBM-C4.5 system models agents' competencies with a set of decision trees, trained on all historical data. Each tree predicts one particular aspect of the agent's action. Predictions from multiple trees are compared for consensus. FBM-C4.5 makes no prediction when predictions from different trees contradict one another. This strategy trades off reduced prediction rates for increased accuracy. To make predictions in the absence of consensus, three techniques have been evaluated. They include using voting, using a tree quality measure and using a leaf quality measure. An alternative technique that merges multiple decision trees into a single tree provides an advantage of producing models that are more comprehensible. However, all of these techniques demonstrated the previously encountered trade-off between rate of prediction and accuracy of prediction, albeit less pronounced. It was hypothesized that models built on more current observations would outperform models built on earlier observations. Experimental results support this hypothesis. A Dual-model system, which takes this temporal factor into account, has been evaluated. This fifth approach achieved a significant improvement in prediction rate without significantly affecting prediction accuracy.},
Address = {Netherlands},
Audit-trail = {Link via {ACM} Portal},
Keywords = {Feature Based Modeling and User Modeling},
Publisher = {Springer},
Related = {feature-based-modeling}
}
ABSTRACT A modeling system may be required to predict an agent's future actions under constraints of inadequate or contradictory relevant historical evidence. This can result in low prediction accuracy, or otherwise, low prediction rates, leaving a set of cases for which no predictions are made. A previous study that explored techniques for improving prediction rates in the context of modeling students' subtraction skills using Feature Based Modeling showed a tradeoff between prediction rate and prediction accuracy. This paper presents research that aims to improve prediction rates without affecting prediction accuracy. The FBM-C4.5 agent modeling system was used in this research. However, the techniques explored are applicable to any Feature Based Modeling system, and the most effective technique developed is applicable to most agent modeling systems. The default FBM-C4.5 system models agents' competencies with a set of decision trees, trained on all historical data. Each tree predicts one particular aspect of the agent's action. Predictions from multiple trees are compared for consensus. FBM-C4.5 makes no prediction when predictions from different trees contradict one another. This strategy trades off reduced prediction rates for increased accuracy. To make predictions in the absence of consensus, three techniques have been evaluated. They include using voting, using a tree quality measure and using a leaf quality measure. An alternative technique that merges multiple decision trees into a single tree provides an advantage of producing models that are more comprehensible. However, all of these techniques demonstrated the previously encountered trade-off between rate of prediction and accuracy of prediction, albeit less pronounced. It was hypothesized that models built on more current observations would outperform models built on earlier observations. Experimental results support this hypothesis. A Dual-model system, which takes this temporal factor into account, has been evaluated. This fifth approach achieved a significant improvement in prediction rate without significantly affecting prediction accuracy.

Zheng, Z., & Webb, G. I. (1998). Stochastic Attribute Selection Committees. Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 321-332.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb98a,
Title = {Stochastic Attribute Selection Committees},
Author = {Z. Zheng and G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
Year = {1998},
Address = {Berlin},
Editor = {G. Antoniou and J.K. Slaney},
Pages = {321-332},
Publisher = {Springer-Verlag},
Abstract = {Classifier committee learning methods generate multiple classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Two such methods, Bagging and Boosting, have shown great success with decision tree learning. They create different classifiers by modifying the distribution of the training set. This paper studies a different approach: Stochastic Attribute Selection Committee learning of decision trees. It generates classifier committees by stochastically modifying the set of attributes but keeping the distribution of the training set unchanged. An empirical evaluation of a variant of this method, namely Sasc, in a representative collection of natural domains shows that the SASC method can significantly reduce the error rate of decision tree learning. On average Sasc is more accurate than Bagging and less accurate than Boosting, although a one-tailed sign test fails to show that these differences are significant at a level of 0.05. In addition, it is found that, like Bagging, Sasc is more stable than Boosting in terms of less frequently obtaining significantly higher error rates than C4.5 and, when error is raised, producing lower error rate increases. Moreover, like Bagging, Sasc is amenable to parallel and distributed processing while Boosting is not.},
Audit-trail = {*},
Keywords = {MultiBoosting and Stochastic Attribute Selection Committees},
Location = {Brisbane, Australia},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT Classifier committee learning methods generate multiple classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Two such methods, Bagging and Boosting, have shown great success with decision tree learning. They create different classifiers by modifying the distribution of the training set. This paper studies a different approach: Stochastic Attribute Selection Committee learning of decision trees. It generates classifier committees by stochastically modifying the set of attributes but keeping the distribution of the training set unchanged. An empirical evaluation of a variant of this method, namely Sasc, in a representative collection of natural domains shows that the SASC method can significantly reduce the error rate of decision tree learning. On average Sasc is more accurate than Bagging and less accurate than Boosting, although a one-tailed sign test fails to show that these differences are significant at a level of 0.05. In addition, it is found that, like Bagging, Sasc is more stable than Boosting in terms of less frequently obtaining significantly higher error rates than C4.5 and, when error is raised, producing lower error rate increases. Moreover, like Bagging, Sasc is amenable to parallel and distributed processing while Boosting is not.
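
SASC proper randomises the attribute set considered at every node during tree induction while leaving the training distribution untouched. A close analogue is available off the shelf: a committee of scikit-learn decision trees, each grown on the full, unweighted training set, with the learner's max_features option restricting every split to a random attribute subset. The function names and parameter values below are illustrative.

# Off-the-shelf analogue of a stochastic attribute selection committee.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def fit_sasc_committee(X, y, n_members=10, p_attrs=0.5):
    # Each member sees the full training set; max_features makes every split consider
    # only a random fraction of the attributes, echoing SASC's per-node stochasticity.
    return [DecisionTreeClassifier(max_features=p_attrs, random_state=m).fit(X, y)
            for m in range(n_members)]

def committee_predict(committee, X):
    # Simple majority vote; assumes integer class labels.
    votes = np.stack([member.predict(X) for member in committee])
    return np.array([np.bincount(col).argmax() for col in votes.T])

Because no member depends on another, the committee can be grown in parallel, which is the property the abstract contrasts with Boosting.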

Webb, G. I. (1998). The Problem of Missing Values in Decision Tree Grafting. Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 273-283.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb98,
Title = {The Problem of Missing Values in Decision Tree Grafting},
Author = {G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
Year = {1998},
Address = {Berlin},
Editor = {G. Antoniou and J.K. Slaney},
Pages = {273-283},
Publisher = {Springer-Verlag},
Abstract = {Decision tree grafting adds nodes to inferred decision trees. Previous research has demonstrated that appropriate grafting techniques can improve predictive accuracy across a wide cross-selection of domains. However, previous decision tree grafting systems are demonstrated to have a serious deficiency for some data sets containing missing values. This problem arises due to the method for handling missing values employed by C4.5, in which the grafting systems have been embedded. This paper provides an explanation of and solution to the problem. Experimental evidence is presented of the efficacy of this solution.},
Audit-trail = {*},
Keywords = {Decision Tree Learning and Decision Tree Grafting and Occams Razor},
Location = {Brisbane, Australia},
Related = {decision-tree-grafting}
}
ABSTRACT Decision tree grafting adds nodes to inferred decision trees. Previous research has demonstrated that appropriate grafting techniques can improve predictive accuracy across a wide cross-selection of domains. However, previous decision tree grafting systems are demonstrated to have a serious deficiency for some data sets containing missing values. This problem arises due to the method for handling missing values employed by C4.5, in which the grafting systems have been embedded. This paper provides an explanation of and solution to the problem. Experimental evidence is presented of the efficacy of this solution.

Webb, G. I. (1998). Preface to UMUAI Special Issue on Machine Learning for User Modeling. User Modeling and User-Adapted Interaction, 8(1), 1-3.
[PDF] [Bibtex]

@Article{Webb98a,
Title = {Preface to UMUAI Special Issue on Machine Learning for User Modeling},
Author = {G. I. Webb},
Journal = {User Modeling and User-Adapted Interaction},
Year = {1998},
Number = {1},
Pages = {1-3},
Volume = {8},
Address = {Netherlands},
Audit-trail = {Link via Kluwer site},
Keywords = {User Modeling},
Publisher = {Kluwer Academic Publishers}
}
ABSTRACT 

Webb, G. I., & Kuzmycz, M. (1998). Evaluation Of Data Aging: A Technique For Discounting Old Data During Student Modeling. Lecture Notes in Computer Science Vol. 1452: Proceedings of the Fourth International Conference on Intelligent Tutoring Systems (ITS '98), Berlin, pp. 384-393.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbKuzmycz98,
Title = {Evaluation Of Data Aging: A Technique For Discounting Old Data During Student Modeling},
Author = {G.I. Webb and M. Kuzmycz},
Booktitle = {Lecture Notes in Computer Science Vol. 1452: Proceedings of the Fourth International Conference on Intelligent Tutoring Systems (ITS '98)},
Year = {1998},
Address = {Berlin},
Editor = {B.P. Goettl and H. M. Halff and C. Redfield and V. Shute },
Pages = {384-393},
Publisher = {Springer-Verlag},
Abstract = {Student modeling systems must operate in an environment in which a student's mastery of a subject matter is likely to change as a lesson progresses. A student model is formed from evaluation of evidence about the student's mastery of the domain. However, given that such mastery will change, older evidence is likely to be less valuable than recent evidence. Data aging addresses this issue by discounting the value of older evidence. This paper provides experimental evaluation of the effects of data aging. While it is demonstrated that data aging can result in statistically significant increases in both the number and accuracy of predictions that a modeling system makes, it is also demonstrated that the reverse can be true. Further, the effects experienced are of only small magnitude. It is argued that these results demonstrate some potential for data aging as a general strategy, but do not warrant employing data aging in its current form.},
Audit-trail = {PDF posted},
Keywords = {Feature Based Modeling and User Modeling},
Location = {San Antonio, Texas},
Related = {feature-based-modeling}
}
ABSTRACT Student modeling systems must operate in an environment in which a student's mastery of a subject matter is likely to change as a lesson progresses. A student model is formed from evaluation of evidence about the student's mastery of the domain. However, given that such mastery will change, older evidence is likely to be less valuable than recent evidence. Data aging addresses this issue by discounting the value of older evidence. This paper provides experimental evaluation of the effects of data aging. While it is demonstrated that data aging can result in statistically significant increases in both the number and accuracy of predictions that a modeling system makes, it is also demonstrated that the reverse can be true. Further, the effects experienced are of only small magnitude. It is argued that these results demonstrate some potential for data aging as a general strategy, but do not warrant employing data aging in its current form.
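
One simple realisation of discounting old evidence is to give each observation an exponentially decayed sample weight by age and pass the weights to any learner that accepts them. The decay rate and helper below are illustrative choices; the paper's actual aging scheme may differ.

# Illustrative age-based weighting of observations.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

def aged_weights(timestamps, decay=0.9):
    # Weight = decay ** age, with age measured back from the most recent observation.
    age = np.max(timestamps) - np.asarray(timestamps, dtype=float)
    return decay ** age

# Toy example: responses observed at time steps 0..7; recent behaviour dominates the model.
t = np.arange(8)
X = np.array([[0], [0], [0], [0], [1], [1], [1], [1]])
y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
model = DecisionTreeClassifier().fit(X, y, sample_weight=aged_weights(t))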

Zheng, Z., Webb, G. I., & Ting, K. M. (1998). Integrating Boosting and Stochastic Attribute Selection Committees for Further Improving The Performance of Decision Tree Learning. Proceedings of the Tenth IEEE International Conference on Tools with Artificial Intelligence (ICTAI-98), Los Alamitos, CA, pp. 216-223.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebbTing98,
Title = {Integrating Boosting and Stochastic Attribute Selection Committees for Further Improving The Performance of Decision Tree Learning},
Author = {Z. Zheng and G. I. Webb and K. M. Ting},
Booktitle = {Proceedings of the Tenth {IEEE} International Conference on Tools with Artificial Intelligence (ICTAI-98)},
Year = {1998},
Address = {Los Alamitos, CA},
Pages = {216-223},
Publisher = {{IEEE} Computer Society Press},
Abstract = {Techniques for constructing classifier committees including boosting and bagging have demonstrated great success, especially boosting for decision tree learning. This type of technique generates several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Boosting and bagging create different classifiers by modifying the distribution of the training set. SASC (Stochastic Attribute Selection Committees) uses an alternative approach to generating classifier committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. We propose a method for improving the performance of boosting. This technique combines boosting and SASC. It builds classifier committees by manipulating both the distribution of the training set and the set of attributes available during induction. In the synergy SASC effectively increases the model diversity of boosting. Experiments with a representative collection of natural domains show that, on average, the combined technique outperforms either boosting or SASC alone in terms of reducing the error rate of decision tree learning.},
Audit-trail = {Available via Citeseer http://citeseer.ist.psu.edu/4952.html},
Keywords = {MultiBoosting and Boosting and Stochastic Attribute Selection Committees},
Location = {Taipei, Taiwan},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT Techniques for constructing classifier committees including boosting and bagging have demonstrated great success, especially boosting for decision tree learning. This type of technique generates several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. Boosting and bagging create different classifiers by modifying the distribution of the training set. SASC (Stochastic Attribute Selection Committees) uses an alternative approach to generating classifier committees by stochastic manipulation of the set of attributes considered at each node during tree induction, but keeping the distribution of the training set unchanged. We propose a method for improving the performance of boosting. This technique combines boosting and SASC. It builds classifier committees by manipulating both the distribution of the training set and the set of attributes available during induction. In the synergy SASC effectively increases the model diversity of boosting. Experiments with a representative collection of natural domains show that, on average, the combined technique outperforms either boosting or SASC alone in terms of reducing the error rate of decision tree learning.
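
The combination can be approximated with standard components: AdaBoost supplies the reweighted training distributions, while the base tree's max_features option supplies stochastic attribute selection at each node. A hedged sketch, assuming a recent scikit-learn release; the parameter values are illustrative only.

# Boosting over a base tree that randomises the attributes considered at each split.
# (Older scikit-learn versions name the first argument base_estimator rather than estimator.)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

boosted_sasc = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_features=0.5),   # random attribute subset per split
    n_estimators=50,
)
# boosted_sasc.fit(X_train, y_train); boosted_sasc.predict(X_test)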

Zheng, Z., & Webb, G. I. (1998). Multiple Boosting: A Combination of Boosting and Bagging. Proceedings of the 1998 International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'98), pp. 1133-1140.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ZhengWebb98b,
Title = {Multiple Boosting: A Combination of Boosting and Bagging},
Author = {Z. Zheng and G. I. Webb},
Booktitle = {Proceedings of the 1998 International Conference on Parallel and Distributed Processing Techniques and Applications (PDPTA'98)},
Year = {1998},
Pages = {1133-1140},
Publisher = {CSREA Press},
Abstract = {Classifier committee learning approaches have demonstrated great success in increasing the prediction accuracy of classifier learning, which is a key technique for data mining. These approaches generate several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. It has been shown that Boosting and Bagging, as two representative methods of this type, can significantly decrease the error rate of decision tree learning. Boosting is generally more accurate than Bagging, but the former is more variable than the latter. In addition, Bagging is amenable to parallel or distributed processing, while Boosting is not. In this paper, we study a new committee learning algorithm, namely MB (Multiple Boosting). It creates multiple subcommittees by combining Boosting and Bagging. Experimental results in a representative collection of natural domains show that MB is, on average, more accurate than either Bagging or Boosting alone. It is more stable than Boosting, and is amenable to parallel or distributed processing. These characteristics make MB a good choice for parallel data mining.},
Audit-trail = {*},
Keywords = {MultiBoosting},
Location = {Las Vegas, Nevada},
Related = {multiboosting-and-multi-strategy-ensemble-learning}
}
ABSTRACT Classifier committee learning approaches have demonstrated great success in increasing the prediction accuracy of classifier learning, which is a key technique for data mining. These approaches generate several classifiers to form a committee by repeated application of a single base learning algorithm. The committee members vote to decide the final classification. It has been shown that Boosting and Bagging, as two representative methods of this type, can significantly decrease the error rate of decision tree learning. Boosting is generally more accurate than Bagging, but the former is more variable than the latter. In addition, Bagging is amenable to parallel or distributed processing, while Boosting is not. In this paper, we study a new committee learning algorithm, namely MB (Multiple Boosting). It creates multiple subcommittees by combining Boosting and Bagging. Experimental results in a representative collection of natural domains show that MB is, on average, more accurate than either Bagging or Boosting alone. It is more stable than Boosting, and is amenable to parallel or distributed processing. These characteristics make MB a good choice for parallel data mining.
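
The layered structure described here, an outer Bagging-style loop over bootstrap samples with a boosted subcommittee trained on each and a vote across all subcommittees, can be sketched generically. This is a reconstruction from the abstract rather than the paper's code; integer class labels are assumed and the names are illustrative.

# Generic sketch of bagged boosted subcommittees.
import numpy as np
from sklearn.ensemble import AdaBoostClassifier

def fit_multiple_boosting(X, y, n_subcommittees=5, boost_rounds=10, seed=0):
    rng = np.random.default_rng(seed)
    subcommittees = []
    for _ in range(n_subcommittees):
        idx = rng.integers(0, len(y), size=len(y))   # bootstrap sample, as in Bagging
        subcommittees.append(AdaBoostClassifier(n_estimators=boost_rounds).fit(X[idx], y[idx]))
    return subcommittees

def predict_multiple_boosting(subcommittees, X):
    # Equal-weight vote across the boosted subcommittees; assumes integer class labels.
    votes = np.stack([sc.predict(X) for sc in subcommittees])
    return np.array([np.bincount(col).argmax() for col in votes.T])

The outer loop is embarrassingly parallel, which is the property the abstract highlights relative to Boosting alone.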

Viswanathan, M., & Webb, G. I. (1998). Classification Learning Using All Rules. Lecture Notes in Computer Science 1398: Proceedings of the Tenth European Conference on Machine Learning (ECML'98), Berlin/Heidelberg, pp. 149-159.
[PDF] [Bibtex] [Abstract]

@InProceedings{ViswanathanWebb98,
Title = {Classification Learning Using All Rules},
Author = {M. Viswanathan and G.I. Webb},
Booktitle = {Lecture Notes in Computer Science 1398: Proceedings of the Tenth European Conference on Machine Learning (ECML'98)},
Year = {1998},
Address = {Berlin/Heidelberg},
Editor = {C. Nedellec and C. Rouveirol},
Pages = {149-159},
Publisher = {Springer},
Abstract = {The covering algorithm has been ubiquitous in the induction of classification rules. This approach to machine learning uses heuristic search that seeks to find a minimum number of rules that adequately explains the data. However, recent research has provided evidence that learning redundant classifiers can increase predictive accuracy. Learning all possible classifiers seems to be a plausible form of this nomination of redundant classifiers. This paper presents an algorithm that in effect learns all classifiers. Preliminary investigations by Webb (1996b) suggest that a heuristic covering algorithm in general learns classification rules with higher predictive accuracy than those learned by this new approach. In this paper we present an extensive empirical comparison between the learning-all-rules algorithm and three varied established approaches to inductive learning, namely a covering algorithm, an instance-based learner and a decision tree learner. Empirical evaluation provides strong evidence in support of learning-all-rules as a plausible approach to inductive learning.},
Audit-trail = {Springerlink not up for this volume yet.},
Keywords = {Lazy Learning and Rule Learning},
Location = {Chemnitz, Germany}
}
ABSTRACT The covering algorithm has been ubiquitous in the induction of classification rules. This approach to machine learning uses heuristic search that seeks to find a minimum number of rules that adequately explains the data. However, recent research has provided evidence that learning redundant classifiers can increase predictive accuracy. Learning all possible classifiers seems to be a plausible form of this nomination of redundant classifiers. This paper presents an algorithm that in effect learns all classifiers. Preliminary investigations by Webb (1996b) suggest that a heuristic covering algorithm in general learns classification rules with higher predictive accuracy than those learned by this new approach. In this paper we present an extensive empirical comparison between the learning-all-rules algorithm and three varied established approaches to inductive learning, namely a covering algorithm, an instance-based learner and a decision tree learner. Empirical evaluation provides strong evidence in support of learning-all-rules as a plausible approach to inductive learning.

Webb, G. I., & Pazzani, M. (1998). Adjusted Probability Naive Bayesian Induction. Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98), Berlin, pp. 285-295.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbPazzani98,
Title = {Adjusted Probability Naive Bayesian Induction},
Author = {G. I. Webb and M. Pazzani},
Booktitle = {Lecture Notes in Computer Science Vol. 1502: Advanced Topics in Artificial Intelligence, Selected Papers from the Eleventh Australian Joint Conference on Artificial Intelligence (AI '98)},
Year = {1998},
Address = {Berlin},
Editor = {G. Antoniou and J.K. Slaney},
Pages = {285-295},
Publisher = {Springer-Verlag},
Abstract = {Naive Bayesian classifiers utilise a simple mathematical model for induction. While it is known that the assumptions on which this model is based are frequently violated, the predictive accuracy obtained in discriminate classification tasks is surprisingly competitive in comparison to more complex induction techniques. Adjusted probability naive Bayesian induction adds a simple extension to the naive Bayesian classifier. A numeric weight is inferred for each class. During discriminate classification, the naive Bayesian probability of a class is multiplied by its weight to obtain an adjusted value. The use of this adjusted value in place of the naive Bayesian probability is shown to significantly improve predictive accuracy.},
Audit-trail = {*},
Keywords = {Conditional Probability Estimation and Bayesian Learning},
Location = {Brisbane, Australia},
Related = {learning-complex-conditional-probabilities-from-data}
}
ABSTRACT Naive Bayesian classifiers utilise a simple mathematical model for induction. While it is known that the assumptions on which this model is based are frequently violated, the predictive accuracy obtained in discriminate classification tasks is surprisingly competitive in comparison to more complex induction techniques. Adjusted probability naive Bayesian induction adds a simple extension to the naive Bayesian classifier. A numeric weight is inferred for each class. During discriminate classification, the naive Bayesian probability of a class is multiplied by its weight to obtain an adjusted value. The use of this adjusted value in place of the naive Bayesian probability is shown to significantly improve predictive accuracy.
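
The adjustment step itself is a one-liner at prediction time: multiply each class's naive Bayes probability by a per-class weight before taking the argmax. In the sketch below the weights are taken as given rather than inferred as in the paper, and scikit-learn's GaussianNB stands in for whichever naive Bayes model is used.

# Per-class weight adjustment applied at prediction time.
import numpy as np
from sklearn.naive_bayes import GaussianNB

def adjusted_nb_predict(nb, class_weights, X):
    # Multiply each class's naive Bayes probability by its weight, then take the argmax.
    adjusted = nb.predict_proba(X) * np.asarray(class_weights)
    return nb.classes_[np.argmax(adjusted, axis=1)]

# Usage (weights here are placeholders; the paper infers them from training data):
# nb = GaussianNB().fit(X_train, y_train)
# y_hat = adjusted_nb_predict(nb, [1.0, 1.6], X_test)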

Smith, P. A., & Webb, G. I. (1998). Overview of a Low-Level Program Visualisation Tool for Novice Programmers. Proceedings of the Sixth International Conference on Computers in Education (ICCE '98), Berlin, pp. 213-216.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{SmithWebb98,
Title = {Overview of a Low-Level Program Visualisation Tool for Novice Programmers},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {Proceedings of the Sixth International Conference on Computers in Education (ICCE '98)},
Year = {1998},
Address = {Berlin},
Pages = {213-216},
Publisher = {Springer-Verlag},
Abstract = {As a programming novice attempts to attain expertise in programming, she must develop adequate mental models and knowledge structures of the programming process. Unfortunately, many of the computerised tools to which novice programmers have access are designed by expert programmers for experts and as such do not meet the needs of novices. Low-level program visualisation tools make explicit the internal workings of program execution and as such can serve as conceptual models onto which novices can assimilate information about programming. This paper discusses the need for such a tool, what features such a tool may include and gives a brief description of an evaluation of a low-level program visualisation tool developed at Deakin University.},
Audit-trail = {Reconstructed paper posted Nov 05},
Keywords = {Program Visualisation},
Location = {Beijing},
Related = {program-visualisation}
}
ABSTRACT As a programming novice attempts to attain expertise in programming, she must develop adequate mental models and knowledge structures of the programming process. Unfortunately, many of the computerised tools to which novice programmers have access are designed by expert programmers for experts and as such do not meet the needs of novices. Low-level program visualisation tools make explicit the internal workings of program execution and as such can serve as conceptual models onto which novices can assimilate information about programming. This paper discusses the need for such a tool, what features such a tool may include and gives a brief description of an evaluation of a low-level program visualisation tool developed at Deakin University.

Webb, G. I. (1997). Decision Tree Grafting. Proceedings of the Fifteenth International Joint Conference on Artificial Intelligence (IJCAI 97), San Francisco, pp. 846-851.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb97,
Title = {Decision Tree Grafting},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Fifteenth International Joint Conference on Artificial Intelligence ({IJCAI} 97)},
Year = {1997},
Address = {San Francisco},
Pages = {846-851},
Publisher = {Morgan Kaufmann},
Abstract = {This paper extends recent work on decision tree grafting. Grafting is an inductive process that adds nodes to inferred decision trees. This process is demonstrated to frequently improve predictive accuracy. Superficial analysis might suggest that decision tree grafting is the direct reverse of pruning. To the contrary, it is argued that the two processes are complementary. This is because, like standard tree growing techniques, pruning uses only local information, whereas grafting uses non-local information. The use of both pruning and grafting in conjunction is demonstrated to provide the best general predictive accuracy over a representative selection of learning tasks.},
Audit-trail = {PDF posted with the permission of IJCAI Inc},
Keywords = {Decision Trees and Decision Tree Grafting and Occams Razor},
Location = {Nagoya, Japan},
Related = {decision-tree-grafting}
}
ABSTRACT This paper extends recent work on decision tree grafting. Grafting is an inductive process that adds nodes to inferred decision trees. This process is demonstrated to frequently improve predictive accuracy. Superficial analysis might suggest that decision tree grafting is the direct reverse of pruning. To the contrary, it is argued that the two processes are complementary. This is because, like standard tree growing techniques, pruning uses only local information, whereas grafting uses non-local information. The use of both pruning and grafting in conjunction is demonstrated to provide the best general predictive accuracy over a representative selection of learning tasks.

Chiu, B. C., & Webb, G. I. (1997). Using C4.5 as an Induction Engine for Agent Modeling: An Experiment of Optimisation. Proceedings (on-line) of The First Machine Learning for User Modeling Workshop (UM'97).
[PDF] [DOI] [Bibtex]  → Related papers and software

@InProceedings{ChiuWebb97,
Title = {Using C4.5 as an Induction Engine for Agent Modeling: An Experiment of Optimisation},
Author = {B. C. Chiu and G.I. Webb},
Booktitle = {Proceedings (on-line) of The First Machine Learning for User Modeling Workshop (UM'97)},
Year = {1997},
Doi = {10.1023/A:1008296930163},
Keywords = {Feature Based Modeling and User Modeling},
Location = {Chia Laguna, Sardinia},
Related = {feature-based-modeling}
}
ABSTRACT 

Chiu, B. C., Webb, G. I., & Kuzmycz, M. (1997). A Comparison of First-Order and Zeroth-Order Induction for Input-Output Agent Modelling. Proceedings of the Sixth International Conference on User Modeling (UM'97), New York/Vienna, pp. 347-358.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ChiuWebbKuzmycz97,
Title = {A Comparison of First-Order and Zeroth-Order Induction for Input-Output Agent Modelling},
Author = {B.C. Chiu and G.I. Webb and M. Kuzmycz},
Booktitle = {Proceedings of the Sixth International Conference on User Modeling (UM'97)},
Year = {1997},
Address = {New York/Vienna},
Editor = {A. Jameson and C. Paris and C. Tasso},
Pages = {347-358},
Publisher = {Springer},
Abstract = {Most student modelling systems seek to develop a model of the internal operation of the cognitive system. In contrast, Input-Output Agent Modelling (IOAM) models an agent in terms of relationships between the inputs and outputs of the cognitive system. Previous IOAM systems have demonstrated high predictive accuracy in the domain of elementary subtraction. These systems use zeroth-order induction. Many of the predicates used, however, represent relations. This suggests that first-order induction might perform well in this domain. This paper reports a study in which zeroth-order and first-order induction engines were used to build models of student subtraction skills. Comparative evaluation shows that zeroth-order induction performs better than first-order in detecting regularities indicating misconceptions while first-order induction leads zeroth-order in detecting regularities indicating correct concepts and inducing a more comprehensible student model. This suggests there exists a trade-off between these factors and that there is still scope for improvement.},
Audit-trail = {*},
Doi = {10.1007/978-3-7091-2670-7_35},
Keywords = {Feature Based Modeling and User Modeling},
Location = {Chia Laguna, Sardinia},
Related = {feature-based-modeling}
}
ABSTRACT Most student modelling systems seek to develop a model of the internal operation of the cognitive system. In contrast, Input-Output Agent Modelling (IOAM) models an agent in terms of relationships between the inputs and outputs of the cognitive system. Previous IOAM systems have demonstrated high predictive accuracy in the domain of elementary subtraction. These systems use zeroth-order induction. Many of the predicates used, however, represent relations. This suggests that first-order induction might perform well in this domain. This paper reports a study in which zeroth-order and first-order induction engines were used to build models of student subtraction skills. Comparative evaluation shows that zeroth-order induction performs better than first-order in detecting regularities indicating misconceptions while first-order induction leads zeroth-order in detecting regularities indicating correct concepts and inducing a more comprehensible student model. This suggests there exists a trade-off between these factors and that there is still scope for improvement.

Webb, G. I., Chiu, B. C., & Kuzmycz, M. (1997). Comparative Evaluation of Alternative Induction Engines for Feature Based Modelling. International Journal of Artificial Intelligence in Education, 8, 97-115.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbChiuKuzmycz97,
Title = {Comparative Evaluation of Alternative Induction Engines for Feature Based Modelling},
Author = {G. I. Webb and B. C. Chiu and M. Kuzmycz},
Journal = {International Journal of Artificial Intelligence in Education},
Year = {1997},
Pages = {97-115},
Volume = {8},
Abstract = {Feature Based Modelling has demonstrated the ability to produce agent models with high accuracy in predicting an agent's future actions. There are a number of respects in which this modelling technique is novel. However, there has been no previous analysis of which aspects of the approach are responsible for its performance. One distinctive feature of the approach is a purpose built induction module. This paper presents a study in which the original custom built Feature Based Modelling induction module was replaced by the C4.5 machine learning system. Comparative evaluation shows that the use of C4.5 increases the number of predictions made without significantly altering the accuracy of those predictions. This suggests that it is the general input-output agent modelling methodology used with both systems that has primary responsibility for the high predictive accuracy previously reported for Feature Based Modelling, rather than its initial idiosyncratic induction technique.},
Address = {Amsterdam},
Audit-trail = {Link via IJAIED site},
Doi = {10.1.1.36.3545},
Keywords = {Feature Based Modeling and User Modeling},
Publisher = {IOS Press},
Related = {feature-based-modeling}
}
ABSTRACT Feature Based Modelling has demonstrated the ability to produce agent models with high accuracy in predicting an agent's future actions. There are a number of respects in which this modelling technique is novel. However, there has been no previous analysis of which aspects of the approach are responsible for its performance. One distinctive feature of the approach is a purpose built induction module. This paper presents a study in which the original custom built Feature Based Modelling induction module was replaced by the C4.5 machine learning system. Comparative evaluation shows that the use of C4.5 increases the number of predictions made without significantly altering the accuracy of those predictions. This suggests that it is the general input-output agent modelling methodology used with both systems that has primary responsibility for the high predictive accuracy previously reported for Feature Based Modelling, rather than its initial idiosyncratic induction technique.

Chiu, B. C., Webb, G. I., & Zheng, Z. (1997). Using Decision Trees for Agent Modelling: A Study on Resolving Conflicting Predictions. Lecture Notes in Computer Science Vol. 1342: Proceedings of the Tenth Australian Joint Conference on Artificial Intelligence (AI'97), Berlin, pp. 349-358.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{ChiuWebbZheng97,
Title = {Using Decision Trees for Agent Modelling: A Study on Resolving Conflicting Predictions},
Author = {B. C. Chiu and G. I. Webb and Z. Zheng},
Booktitle = {Lecture Notes in Computer Science Vol. 1342: Proceedings of the Tenth Australian Joint Conference on Artificial Intelligence (AI'97)},
Year = {1997},
Address = {Berlin},
Editor = {A. Sattar},
Pages = {349-358},
Publisher = {Springer-Verlag},
Abstract = {Input-Output Agent Modelling (IOAM) is an approach to modelling an agent in terms of relationships between the inputs and outputs of the cognitive system. This approach, together with a leading inductive learning algorithm, C4.5, has been adopted to build a subtraction skill modeller, C4.5-IOAM. It models agents' competencies with a set of decision trees. C4.5-IOAM makes no prediction when predictions from different decision trees are contradictory. This paper proposes three techniques for resolving such situations. Two techniques involve selecting the more reliable prediction from a set of competing predictions using a tree quality measure and a leaf quality measure. The other technique merges multiple decision trees into a single tree. This has the additional advantage of producing more comprehensible models. Experimental results, in the domain of modelling elementary subtraction skills, showed that the tree quality and the leaf quality of a decision path provided valuable references for resolving contradicting predictions and a single tree model representation performed nearly equally well to the multi-tree model representation.},
Audit-trail = {Reconstructed paper posted 11/10/05},
Keywords = {Feature Based Modeling and User Modeling},
Location = {Perth, Australia},
Related = {feature-based-modeling}
}
ABSTRACT Input-Output Agent Modelling (IOAM) is an approach to modelling an agent in terms of relationships between the inputs and outputs of the cognitive system. This approach, together with a leading inductive learning algorithm, C4.5, has been adopted to build a subtraction skill modeller, C4.5-IOAM. It models agents' competencies with a set of decision trees. C4.5-IOAM makes no prediction when predictions from different decision trees are contradictory. This paper proposes three techniques for resolving such situations. Two techniques involve selecting the more reliable prediction from a set of competing predictions using a tree quality measure and a leaf quality measure. The other technique merges multiple decision trees into a single tree. This has the additional advantage of producing more comprehensible models. Experimental results, in the domain of modelling elementary subtraction skills, showed that the tree quality and the leaf quality of a decision path provided valuable references for resolving contradicting predictions and a single tree model representation performed nearly as well as the multi-tree model representation.
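
A minimal sketch of the leaf-quality style of conflict resolution described above: each tree's prediction carries simple statistics about the leaf at the end of its decision path, and when predictions disagree the one with the best leaf quality wins. The Laplace-corrected accuracy used as the quality measure, and the numbers, are illustrative assumptions rather than the paper's definitions.

from dataclasses import dataclass

@dataclass
class Prediction:
    value: str          # predicted value of the output feature
    leaf_correct: int   # training examples at the leaf with this value
    leaf_total: int     # training examples reaching the leaf

    @property
    def leaf_quality(self) -> float:
        # Laplace-corrected accuracy of the decision path's leaf.
        return (self.leaf_correct + 1) / (self.leaf_total + 2)

def resolve(predictions):
    """Return a single value: the agreed one, or the highest-quality one."""
    if len({p.value for p in predictions}) == 1:
        return predictions[0].value                  # no conflict to resolve
    return max(predictions, key=lambda p: p.leaf_quality).value

print(resolve([Prediction("borrow", 18, 20), Prediction("no-borrow", 3, 9)]))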

Webb, G. I. (1996). Integrating Machine Learning With Knowledge Acquisition Through Direct Interaction With Domain Experts. Knowledge-Based Systems, 9, 253-266.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb96a,
Title = {Integrating Machine Learning With Knowledge Acquisition Through Direct Interaction With Domain Experts},
Author = {G. I. Webb},
Journal = {Knowledge-Based Systems},
Year = {1996},
Pages = {253-266},
Volume = {9},
Abstract = {Knowledge elicitation from experts and empirical machine learning are two distinct approaches to knowledge acquisition with differing and mutually complementary capabilities. Learning apprentices have provided environments in which a knowledge engineer may collaborate with a machine learning system allowing for a synergy between the complementary approaches. The Knowledge Factory is a knowledge acquisition environment that allows a domain expert to collaborate directly with a machine learning system without the need for assistance from a knowledge engineer. This requires a different form of environment to the learning apprentice. This paper describes techniques for supporting such interactions and their implementation in a knowledge acquisition environment called The Knowledge Factory.},
Audit-trail = {Link via Science Direct},
Keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
Publisher = {Elsevier},
Related = {interactive-machine-learning}
}
ABSTRACT Knowledge elicitation from experts and empirical machine learning are two distinct approaches to knowledge acquisition with differing and mutually complementary capabilities. Learning apprentices have provided environments in which a knowledge engineer may collaborate with a machine learning system allowing for a synergy between the complementary approaches. The Knowledge Factory is a knowledge acquisition environment that allows a domain expert to collaborate directly with a machine learning system without the need for assistance from a knowledge engineer. This requires a different form of environment to the learning apprentice. This paper describes techniques for supporting such interactions and their implementation in a knowledge acquisition environment called The Knowledge Factory.

Webb, G. I. (1996). Inclusive Pruning: A New Class of Pruning Rule for Unordered Search and its Application to Classification Learning. Australian Computer Science Communications Vol. 18 (1): Proceedings of the Nineteenth Australasian Computer Science Conference (ACSC'96), Melbourne, pp. 1-10.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb96e,
Title = {Inclusive Pruning: A New Class of Pruning Rule for Unordered Search and its Application to Classification Learning},
Author = {G. I. Webb},
Booktitle = {Australian Computer Science Communications Vol. 18 (1): Proceedings of the Nineteenth Australasian Computer Science Conference (ACSC'96)},
Year = {1996},
Address = {Melbourne},
Editor = {K. Ramamohanarao},
Pages = {1-10},
Publisher = {ACS},
Abstract = {This paper presents a new class of pruning rule for unordered search. Previous pruning rules for unordered search identify operators that should not be applied in order to prune nodes reached via those operators. In contrast, the new pruning rules identify operators that should be applied and prune nodes that are not reached via those operators. Specific pruning rules employing both these approaches are identified for classification learning. Experimental results demonstrate that application of the new pruning rules can reduce by more than 60% the number of states from the search space that are considered during classification learning.},
Audit-trail = {*},
Keywords = {Search and Rule Learning and OPUS and Association Rule Discovery},
Location = {Royal Melbourne Institute of Technology, Australia},
Related = {opus-search}
}
ABSTRACT This paper presents a new class of pruning rule for unordered search. Previous pruning rules for unordered search identify operators that should not be applied in order to prune nodes reached via those operators. In contrast, the new pruning rules identify operators that should be applied and prune nodes that are not reached via those operators. Specific pruning rules employing both these approaches are identified for classification learning. Experimental results demonstrate that application of the new pruning rules can reduce by more than 60% the number of states from the search space that are considered during classification learning.
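
The contrast between the two classes of pruning rule can be seen on a toy space of conjunctions over four conditions: a conventional (exclusive) rule discards every node reached by applying some condition, whereas an inclusive rule discards every node that is not reached by applying it. The sketch below only counts the surviving nodes to show the effect of each kind of rule; it does not implement the specific pruning rules proposed in the paper.

from itertools import combinations

conditions = ["a", "b", "c", "d"]
space = [frozenset(s) for r in range(len(conditions) + 1)
         for s in combinations(conditions, r)]           # 2**4 = 16 conjunctions

exclusive = [node for node in space if "a" not in node]  # prune nodes reached via 'a'
inclusive = [node for node in space if "a" in node]      # prune nodes not reached via 'a'

print(len(space), len(exclusive), len(inclusive))        # 16 8 8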

Webb, G. I. (1996). Cost Sensitive Specialisation. Lecture Notes in Computer Science Vol. 1114. Topics in Artificial Intelligence: Proceedings of the Fourth Pacific Rim International Conference on Artificial Intelligence (PRICAI'96), Berlin/Heidelberg, pp. 23-34.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb96d,
Title = {Cost Sensitive Specialisation},
Author = {G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 1114. Topics in Artificial Intelligence: Proceedings of the Fourth {Pacific} Rim International Conference on Artificial Intelligence (PRICAI'96)},
Year = {1996},
Address = {Berlin/Heidelberg},
Editor = {N.Y. Foo and R. Goebel},
Pages = {23-34},
Publisher = {Springer-Verlag},
Abstract = {Cost-sensitive specialization is a generic technique for misclassification cost sensitive induction. This technique involves specializing aspects of a classifier associated with high misclassification costs and generalizing those associated with low misclassification costs. It is widely applicable and simple to implement. It could be used to augment the effect of standard cost-sensitive induction techniques. It should directly extend to test application cost sensitive induction tasks. Experimental evaluation demonstrates consistent positive effects over a range of misclassification cost sensitive learning tasks.},
Audit-trail = {*},
Keywords = {Cost Sensitive Learning and Generality},
Location = {Cairns, Australia},
Related = {generality-is-predictive-of-prediction-accuracy}
}
ABSTRACT Cost-sensitive specialization is a generic technique for misclassification cost sensitive induction. This technique involves specializing aspects of a classifier associated with high misclassification costs and generalizing those associated with low misclassification costs. It is widely applicable and simple to implement. It could be used to augment the effect of standard cost-sensitive induction techniques. It should directly extend to test application cost sensitive induction tasks. Experimental evaluation demonstrates consistent positive effects over a range of misclassification cost sensitive learning tasks.

Webb, G. I., & Kuzmycz, M. (1996). Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents' Competencies. User Modelling and User-Adapted Interaction, 5(2), 117-150.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@Article{WebbKuzmycz96,
Title = {Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents' Competencies},
Author = {G.I. Webb and M. Kuzmycz},
Journal = {User Modelling and User-Adapted Interaction},
Year = {1996},
Number = {2},
Pages = {117-150},
Volume = {5},
Abstract = {Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.},
Address = {Netherlands},
Audit-trail = {Kluwer Online publications only available from Dec 1997 onwards. Not found via {ACM} Portal},
Keywords = {Feature Based Modeling},
Publisher = {Springer},
Related = {feature-based-modeling}
}
ABSTRACT Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.

Webb, G. I. (1996). Further Experimental Evidence Against The Utility Of Occam's Razor. Journal of Artificial Intelligence Research, 4, 397-417.
[DOI] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb96b,
Title = {Further Experimental Evidence Against The Utility Of {Occam}'s Razor},
Author = {G. I. Webb},
Journal = {Journal of Artificial Intelligence Research},
Year = {1996},
Pages = {397-417},
Volume = {4},
Abstract = {This paper presents new experimental evidence against the utility of Occam's razor. A systematic procedure is presented for post-processing decision trees produced by C4.5. This procedure was derived by rejecting Occam's razor and instead attending to the assumption that similar objects are likely to belong to the same class. It increases a decision tree's complexity without altering the performance of that tree on the training data from which it is inferred. The resulting more complex decision trees are demonstrated to have, on average, for a variety of common learning tasks, higher predictive accuracy than the less complex original decision trees. This result raises considerable doubt about the utility of Occam's razor as it is commonly applied in modern machine learning.},
Address = {Menlo Park, CA},
Audit-trail = {Link to paper via JAIR website},
Doi = {10.1613/jair.228},
Keywords = {Decision Trees and Decision Tree Grafting and Occams Razor},
Publisher = {AAAI Press},
Related = {occams-razor-in-machine-learning}
}
ABSTRACT This paper presents new experimental evidence against the utility of Occam's razor. A systematic procedure is presented for post-processing decision trees produced by C4.5. This procedure was derived by rejecting Occam's razor and instead attending to the assumption that similar objects are likely to belong to the same class. It increases a decision tree's complexity without altering the performance of that tree on the training data from which it is inferred. The resulting more complex decision trees are demonstrated to have, on average, for a variety of common learning tasks, higher predictive accuracy than the less complex original decision trees. This result raises considerable doubt about the utility of Occam's razor as it is commonly applied in modern machine learning.
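
The post-processing idea is easiest to see on a one-dimensional toy problem: a test can be grafted inside a leaf so that a vacant part of the leaf's region is relabelled with the class of its nearest training examples, which leaves every training case classified exactly as before but changes predictions on some unseen cases. This is a simplified sketch of that principle, with illustrative thresholds; it is not the paper's actual post-processing procedure.

# Training data: class 'a' clusters near 1, class 'b' clusters near 9.
train = [(0.5, "a"), (1.0, "a"), (1.5, "a"), (9.0, "b"), (9.5, "b")]

def original_tree(x):
    # One split; every training example is classified correctly.
    return "a" if x <= 7.0 else "b"

def grafted_tree(x):
    # Extra test grafted inside the left leaf. The region 5.25 < x <= 7.0
    # contains no training examples, so relabelling it as 'b' (the class of
    # its nearest training neighbours) changes nothing on the training set.
    if x <= 7.0:
        return "a" if x <= 5.25 else "b"
    return "b"

assert all(original_tree(x) == c for x, c in train)
assert all(grafted_tree(x) == c for x, c in train)
print(original_tree(6.5), grafted_tree(6.5))  # the trees differ on an unseen case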

Webb, G. I., & Wells, J. (1996). An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition Through Direct Interaction with Domain Experts. Proceedings of the 1996 Pacific Knowledge Acquisition Workshop (PKAW'96), Sydney, pp. 170-189.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbWells96,
Title = {An Experimental Evaluation of Integrating Machine Learning with Knowledge Acquisition Through Direct Interaction with Domain Experts},
Author = {G.I. Webb and J. Wells},
Booktitle = {Proceedings of the 1996 {Pacific} Knowledge Acquisition Workshop (PKAW'96)},
Year = {1996},
Address = {Sydney},
Editor = {P. Compton and R. Mizoguchi and H. Motada and T. Menzies},
Pages = {170-189},
Publisher = {UNSW Press},
Abstract = {Machine learning and knowledge acquisition from experts have distinct and apparently complementary knowledge acquisition capabilities. This study demonstrates that the integration of these approaches can both improve the accuracy of the knowledge base that is developed and reduce the time taken to develop it. The system studied, called The Knowledge Factory is distinguished by the manner in which it supports direct interaction with domain experts with little or no knowledge engineering expertise. The benefits reported relate to use by such users. In addition to the improved quality of the knowledge base, in questionnaire responses the users provided favourable evaluations of the integration of machine learning with knowledge acquisition within the system.},
Audit-trail = {Reconstructed paper posted April 2006},
Keywords = {Machine Learning with Knowledge Acquisition from Experts and Rule Learning},
Location = {Coogee, Sydney, Australia},
Related = {interactive-machine-learning}
}
ABSTRACT Machine learning and knowledge acquisition from experts have distinct and apparently complementary knowledge acquisition capabilities. This study demonstrates that the integration of these approaches can both improve the accuracy of the knowledge base that is developed and reduce the time taken to develop it. The system studied, called The Knowledge Factory is distinguished by the manner in which it supports direct interaction with domain experts with little or no knowledge engineering expertise. The benefits reported relate to use by such users. In addition to the improved quality of the knowledge base, in questionnaire responses the users provided favourable evaluations of the integration of machine learning with knowledge acquisition within the system.

Webb, G. I. (1996). A Heuristic Covering Algorithm Outperforms Learning All Rules. Proceedings of Information, Statistics and Induction in Science (ISIS '96), Singapore, pp. 20-30.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb96c,
Title = {A Heuristic Covering Algorithm Outperforms Learning All Rules},
Author = {G. I. Webb},
Booktitle = {Proceedings of Information, Statistics and Induction in Science (ISIS '96)},
Year = {1996},
Address = {Singapore},
Pages = {20-30},
Publisher = {World Scientific},
Abstract = {The induction of classification rules has been dominated by a single generic technique: the covering algorithm. This approach employs a simple hill-climbing search to learn sets of rules. Such search is subject to numerous widely known deficiencies. Further, there is a growing body of evidence that learning redundant sets of rules can improve predictive accuracy. The ultimate end-point of a move toward learning redundant rule sets would appear to be to learn and employ all possible rules. This paper presents a learning system that does this. An empirical investigation shows that, while the approach often achieves higher predictive accuracy than a covering algorithm, the covering algorithm outperforms induction of all rules significantly more frequently. Preliminary analysis suggests that learning all rules performs well when the training set clearly defines the decision surfaces but that the heuristic covering algorithm performs better when the decision surfaces are not clearly delineated by the training examples.},
Audit-trail = {*},
Keywords = {Lazy Learning and Rule Learning},
Location = {Melbourne, Australia}
}
ABSTRACT The induction of classification rules has been dominated by a single generic technique: the covering algorithm. This approach employs a simple hill-climbing search to learn sets of rules. Such search is subject to numerous widely known deficiencies. Further, there is a growing body of evidence that learning redundant sets of rules can improve predictive accuracy. The ultimate end-point of a move toward learning redundant rule sets would appear to be to learn and employ all possible rules. This paper presents a learning system that does this. An empirical investigation shows that, while the approach often achieves higher predictive accuracy than a covering algorithm, the covering algorithm outperforms induction of all rules significantly more frequently. Preliminary analysis suggests that learning all rules performs well when the training set clearly defines the decision surfaces but that the heuristic covering algorithm performs better when the decision surfaces are not clearly delineated by the training examples.
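
For concreteness, the sketch below shows the covering approach the paper compares against: one rule at a time is grown by greedy hill-climbing over attribute-value tests, the covered examples are removed, and the process repeats. The toy data and the simple purity heuristic are illustrative assumptions; the systems evaluated in the paper are more sophisticated.

def covers(rule, example):
    return all(example[a] == v for a, v in rule)

def learn_rule(examples, target):
    """Greedily add attribute=value tests while they improve purity."""
    rule, pool = [], list(examples)
    while True:
        pos = [e for e in pool if e["class"] == target]
        candidates = {(a, e[a]) for e in pos for a in e if a != "class"}
        def purity(test):
            covered = [e for e in pool if e[test[0]] == test[1]]
            return sum(e["class"] == target for e in covered) / len(covered)
        best = max(candidates, key=purity, default=None)
        if best is None or best in rule:
            break
        rule.append(best)
        pool = [e for e in pool if covers(rule, e)]
        if all(e["class"] == target for e in pool):
            break
    return rule

def covering_algorithm(examples, target):
    """Learn rules for `target` until no positive examples remain uncovered."""
    rules, remaining = [], list(examples)
    while any(e["class"] == target for e in remaining):
        rule = learn_rule(remaining, target)
        rules.append(rule)
        remaining = [e for e in remaining if not covers(rule, e)]
    return rules

data = [
    {"outlook": "sunny", "windy": "no",  "class": "play"},
    {"outlook": "sunny", "windy": "yes", "class": "play"},
    {"outlook": "rain",  "windy": "yes", "class": "stay"},
    {"outlook": "rain",  "windy": "no",  "class": "play"},
]
print(covering_algorithm(data, "play"))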

Smith, P. A., & Webb, G. I. (1995). Transparency Debugging with Explanations for Novice Programmers. Proceedings of the Second International Workshop on Automated and Algorithmic Debugging (AADEBUG'95).
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{SmithWebb95a,
Title = {Transparency Debugging with Explanations for Novice Programmers},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {Proceedings of the Second International Workshop on Automated and Algorithmic Debugging (AADEBUG'95)},
Year = {1995},
Editor = {M. Ducass{\'e}},
Publisher = {IRISA-CNRS},
Abstract = {Novice programmers often find programming to be a difficult and frustrating task. Because of their lack of experience in programming novices have different needs to experts when it comes to debugging assistants. One way a debugging assistant could be tailored to novices, as proposed by Eisenstadt, is to provide them with an explicit model of how their program works and, hence encourage them to find errors for themselves. We discuss such a transparency debugger, Bradman, that we have been developing to assist novice programmers understand and debug their C programs. We also present the results of an experiment, conducted on volunteer novice programmers, in which approximately half of the students had access to an explanation of each statement as it was executed and the other half did not. We show that access to such explanations provided beneficial results for a significant number of students.},
Keywords = {Program Visualisation},
Location = {Saint-Malo, France},
Related = {program-visualisation}
}
ABSTRACT Novice programmers often find programming to be a difficult and frustrating task. Because of their lack of experience in programming novices have different needs to experts when it comes to debugging assistants. One way a debugging assistant could be tailored to novices, as proposed by Eisenstadt, is to provide them with an explicit model of how their program works and, hence encourage them to find errors for themselves. We discuss such a transparency debugger, Bradman, that we have been developing to assist novice programmers understand and debug their C programs. We also present the results of an experiment, conducted on volunteer novice programmers, in which approximately half of the students had access to an explanation of each statement as it was executed and the other half did not. We show that access to such explanations provided beneficial results for a significant number of students.

Webb, G. I., & Wells, J. (1995). Recent Progress in Machine-Expert Collaboration for Knowledge Acquisition. Proceedings of the Eighth Australian Joint Conference on Artificial Intelligence (AI'95), Singapore, pp. 291-298.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbWells95,
Title = {Recent Progress in Machine-Expert Collaboration for Knowledge Acquisition},
Author = {G. I. Webb and J. Wells},
Booktitle = {Proceedings of the Eighth Australian Joint Conference on Artificial Intelligence (AI'95)},
Year = {1995},
Address = {Singapore},
Editor = {X. Yao},
Pages = {291-298},
Publisher = {World Scientific},
Abstract = {Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.},
Audit-trail = {*},
Keywords = {Machine Learning with Knowledge Acquisition from Experts},
Location = {Canberra, Australia},
Related = {interactive-machine-learning}
}
ABSTRACT Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.

Webb, G. I. (1995). OPUS: An Efficient Admissible Algorithm For Unordered Search. Journal of Artificial Intelligence Research, 3, 431-465.
[URL] [Bibtex] [Abstract]  → Related papers and software

@Article{Webb95,
Title = {OPUS: An Efficient Admissible Algorithm For Unordered Search},
Author = {G. I. Webb},
Journal = {Journal of Artificial Intelligence Research},
Year = {1995},
Pages = {431-465},
Volume = {3},
Abstract = {OPUS is a branch and bound search algorithm that enables efficient admissible search through spaces for which the order of search operator application is not significant. The algorithm's search efficiency is demonstrated with respect to very large machine learning search spaces. The use of admissible search is of potential value to the machine learning community as it means that the exact learning biases to be employed for complex learning tasks can be precisely specified and manipulated. OPUS also has potential for application in other areas of artificial intelligence, notably, truth maintenance.},
Address = {Menlo Park, CA},
Audit-trail = {Link to paper via JAIR website},
Keywords = {Search and Rule Learning and OPUS and Association Rule Discovery},
Publisher = {AAAI Press},
Related = {opus-search},
Url = {http://dx.doi.org/10.1613/jair.227}
}
ABSTRACT OPUS is a branch and bound search algorithm that enables efficient admissible search through spaces for which the order of search operator application is not significant. The algorithm's search efficiency is demonstrated with respect to very large machine learning search spaces. The use of admissible search is of potential value to the machine learning community as it means that the exact learning biases to be employed for complex learning tasks can be precisely specified and manipulated. OPUS also has potential for application in other areas of artificial intelligence, notably, truth maintenance.
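
A minimal sketch of the branch-and-bound idea behind OPUS, specialised to conjunctive rule search: children are expanded in a fixed order so that each combination of conditions is generated only once, and a branch is abandoned whenever an optimistic bound on its best attainable value cannot beat the incumbent. The evaluation function (positives covered minus negatives covered), its bound (positives covered) and the data are illustrative assumptions, and the sketch omits OPUS's operator reordering and its propagation of pruning to sibling nodes.

CONDITIONS = [("outlook", "sunny"), ("outlook", "rain"),
              ("windy", "no"), ("windy", "yes")]

DATA = [
    ({"outlook": "sunny", "windy": "no"},  True),
    ({"outlook": "sunny", "windy": "yes"}, True),
    ({"outlook": "rain",  "windy": "yes"}, False),
    ({"outlook": "rain",  "windy": "no"},  False),
]

def covered(rule, example):
    return all(example.get(a) == v for a, v in rule)

def value(rule):
    pos = sum(covered(rule, e) for e, label in DATA if label)
    neg = sum(covered(rule, e) for e, label in DATA if not label)
    return pos - neg

def optimistic(rule):
    # Upper bound on the value of any specialisation: adding conditions can
    # only shrink coverage, so at best all negatives are shed.
    return sum(covered(rule, e) for e, label in DATA if label)

def opus(rule=(), available=tuple(range(len(CONDITIONS))), best=((), -10**9)):
    best_rule, best_value = best
    v = value(rule)
    if v > best_value:
        best_rule, best_value = rule, v
    # Expand children in a fixed order so each conjunction is built once only.
    for i, idx in enumerate(available):
        child = rule + (CONDITIONS[idx],)
        if optimistic(child) <= best_value:
            continue                          # admissible pruning
        best_rule, best_value = opus(child, available[i + 1:],
                                     (best_rule, best_value))
    return best_rule, best_value

print(opus())   # best conjunction and its value on the toy data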

Newlands, D., & Webb, G. I. (1995). Polygonal Inductive Generalisation System. Proceedings of the Eighth International Conference on Industrial and Engineering Applications of Artificial Intelligence and Expert Systems (IEA/AIE '95), Newark, NJ, USA, pp. 587-592.
[PDF] [Bibtex] [Abstract]

@InProceedings{NewlandsWebb95,
Title = {Polygonal Inductive Generalisation System},
Author = {D. Newlands and G. I. Webb},
Booktitle = {Proceedings of the Eighth International Conference on Industrial and Engineering Applications of Artificial Intelligence and Expert Systems (IEA/AIE '95)},
Year = {1995},
Address = {Newark, NJ, USA},
Editor = {G. Forsyth and M. Ali},
Pages = {587-592},
Publisher = {Gordon and Breach Science Publishers, Inc},
Abstract = {Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.},
Audit-trail = {*},
Keywords = {Convex Hulls},
Location = {Melbourne, Australia}
}
ABSTRACT Knowledge acquisition remains one of the primary constraints on the development of expert systems. A number of researchers have explored methods for allowing a machine learning system to assist a knowledge engineer in knowledge acquisition. In contrast, we are exploring methods for enabling an expert to directly interact with a machine learning system to collaborate during knowledge acquisition. We report recent extensions to our methodology encompassing a revised model of the role of machine learning in knowledge acquisition; techniques for communication between a machine learning system and a domain expert and novel forms of assistance that a machine learning system may provide to an expert.

Smith, P. A., & Webb, G. I. (1995). Reinforcing a Generic Computer Model for Novice Programmers. Proceedings of the Seventh Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '95), Melbourne.
[PDF] [Bibtex] [Abstract]

@InProceedings{SmithWebb95b,
Title = {Reinforcing a Generic Computer Model for Novice Programmers},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {Proceedings of the Seventh Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '95)},
Year = {1995},
Address = {Melbourne},
Publisher = {ASCILITE},
Abstract = {Novices often find learning their first programming language to be a frustrating and difficult process. They have difficulties in developing and debugging their programs. One of their problems is that their mental model of how the computer works is inadequate. In this paper we discuss a programming assistant, called Bradman, which we are currently developing. It is aimed at novice programmers and designed to reinforce a concrete mental model of how the computer works as a program is executed. It shows explicitly how program states change as statements in the procedural language C are executed. It does this by means of graphical display together with contextualised verbal explanations of each statement.},
Keywords = {Program Visualisation},
Location = {Melbourne, Australia}
}
ABSTRACT Novices often find learning their first programming language to be a frustrating and difficult process. They have difficulties in developing and debugging their programs. One of their problems is that their mental model of how the computer works is inadequate. In this paper we discuss a programming assistant, called Bradman, which we are currently developing. It is aimed at novice programmers and designed to reinforce a concrete mental model of how the computer works as a program is executed. It shows explicitly how program states change as statements in the procedural language C are executed. It does this by means of graphical display together with contextualised verbal explanations of each statement.

Yip, S., & Webb, G. I. (1994). Empirical Function Attribute Construction in Classification Learning. Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94), Singapore, pp. 29-36.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YipWebb94a,
Title = {Empirical Function Attribute Construction in Classification Learning},
Author = {S. Yip and G. I. Webb},
Booktitle = {Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94)},
Year = {1994},
Address = {Singapore},
Editor = {C. Zhang and J. Debenham and D. Lukose},
Pages = {29-36},
Publisher = {World Scientific},
Abstract = {The merits of incorporating feature construction to assist selective induction in learning hard concepts are well documented. This paper introduces the notion of function attributes and reports a method of incorporating functional regularities in classifiers. Training sets are preprocessed with this method before submission to a selective induction classification learning system. The method, referred to as FAFA (function attribute finding), is characterised by finding bivariate functions that contribute to the discrimination between classes and then transforming them to function attributes as additional attributes of the data set. The value of each function attribute equals the deviation of each example from the value obtained by applying that function to the example. The expanded data set is then submitted to classification learning. Evaluation with published and artificial data shows that this method can improve classifiers in terms of predictive accuracy and complexity.},
Keywords = {Constructive Induction},
Location = {Armidale, NSW, Australia},
Related = {feature-construction}
}
ABSTRACT The merits of incorporating feature construction to assist selective induction in learning hard concepts are well documented. This paper introduces the notion of function attributes and reports a method of incorporating functional regularities in classifiers. Training sets are preprocessed with this method before submission to a selective induction classification learning system. The method, referred to as FAFA (function attribute finding), is characterised by finding bivariate functions that contribute to the discrimination between classes and then transforming them to function attributes as additional attributes of the data set. The value of each function attribute equals the deviation of each example from the value obtained by applying that function to the example. The expanded data set is then submitted to classification learning. Evaluation with published and artificial data shows that this method can improve classifiers in terms of predictive accuracy and complexity.
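
A minimal sketch of the function-attribute idea, assuming a straight-line relation between two numeric attributes and using numpy's polyfit together with scikit-learn's DecisionTreeClassifier as stand-ins for the paper's components: the relation is fitted, each example's deviation from it becomes an additional attribute, and the expanded data is then given to an ordinary learner.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
petal_len, petal_wid = X[:, 2], X[:, 3]

# Fit petal width as a (hypothetical) linear function of petal length.
slope, intercept = np.polyfit(petal_len, petal_wid, deg=1)

# Function attribute: each example's deviation from the fitted relation.
deviation = petal_wid - (slope * petal_len + intercept)
X_expanded = np.column_stack([X, deviation])

plain = DecisionTreeClassifier(random_state=0).fit(X, y)
expanded = DecisionTreeClassifier(random_state=0).fit(X_expanded, y)
print(plain.tree_.node_count, expanded.tree_.node_count)  # compare tree sizes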

Yip, S., & Webb, G. I. (1994). Incorporating Canonical Discriminant Attributes in Classification Learning. Proceedings of the Tenth Biennial Canadian Artificial Intelligence Conference (AI-94), San Francisco, pp. 63-70.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YipWebb94b,
Title = {Incorporating Canonical Discriminant Attributes in Classification Learning},
Author = {S. Yip and G. I. Webb},
Booktitle = {Proceedings of the Tenth Biennial Canadian Artificial Intelligence Conference (AI-94)},
Year = {1994},
Address = {San Francisco},
Editor = {R. Elio},
Pages = {63-70},
Publisher = {Morgan Kaufmann},
Abstract = {This paper describes a method for incorporating canonical discriminant attributes in classification machine learning. Though decision trees and rules have semantic appeal when building expert systems, the merits of discriminant analysis are well documented. For data sets on which discriminant analysis obtains significantly better predictive accuracy than symbolic machine learning, the incorporation of canonical discriminant attributes can benefit machine learning. The process starts by applying canonical discriminant analysis to the training set. The canonical discriminant attributes are included as additional attributes. The expanded data set is then subjected to machine learning. This enables linear combinations of numeric attributes to be incorporated in the classifiers that are learnt. Evaluation on the data sets on which discriminant analysis performs better than most machine learning systems, such as the Iris flowers and Waveform data sets, shows that incorporating the power of discriminant analysis in machine classification learning can significantly improve the predictive accuracy and reduce the complexity of classifiers induced by machine learning systems.},
Keywords = {Constructive Induction},
Location = {Banff, Canada},
Related = {feature-construction}
}
ABSTRACT This paper describes a method for incorporating canonical discriminant attributes in classification machine learning. Though decision trees and rules have semantic appeal when building expert systems, the merits of discriminant analysis are well documented. For data sets on which discriminant analysis obtains significantly better predictive accuracy than symbolic machine learning, the incorporation of canonical discriminant attributes can benefit machine learning. The process starts by applying canonical discriminant analysis to the training set. The canonical discriminant attributes are included as additional attributes. The expanded data set is then subjected to machine learning. This enables linear combinations of numeric attributes to be incorporated in the classifiers that are learnt. Evaluation on the data sets on which discriminant analysis performs better than most machine learning systems, such as the Iris flowers and Waveform data sets, shows that incorporating the power of discriminant analysis in machine classification learning can significantly improve the predictive accuracy and reduce the complexity of classifiers induced by machine learning systems.
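
A minimal sketch of the data-expansion step, with scikit-learn's LinearDiscriminantAnalysis standing in for canonical discriminant analysis and a decision tree in place of the symbolic learners studied. The Iris data is used because the abstract mentions it; the rest of the setup is an illustrative assumption.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Canonical discriminant attributes: class-separating linear combinations of
# the numeric attributes, fitted on the training data only.
lda = LinearDiscriminantAnalysis().fit(X_train, y_train)
X_train_exp = np.hstack([X_train, lda.transform(X_train)])
X_test_exp = np.hstack([X_test, lda.transform(X_test)])

plain = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
expanded = DecisionTreeClassifier(random_state=0).fit(X_train_exp, y_train)
print(plain.score(X_test, y_test), expanded.score(X_test_exp, y_test))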

Webb, G. I. (1994). Recent Progress in Learning Decision Lists by Prepending Inferred Rules. Proceedings of the Second Singapore International Conference on Intelligent Systems (SPICIS-94), Singapore, pp. B280-B285.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb94a,
Title = {Recent Progress in Learning Decision Lists by Prepending Inferred Rules},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Second Singapore International Conference on Intelligent Systems (SPICIS-94)},
Year = {1994},
Address = {Singapore},
Pages = {B280-B285},
Publisher = {{Asia} Computer Weekly},
Abstract = {This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. By contrast, the classic algorithm operates by appending successive rules to the end of the decision list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy in less time than the classic algorithm.},
Audit-trail = {*},
Keywords = {Rule Learning and Prepend},
Location = {Singapore},
Related = {prepending}
}
ABSTRACT This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. By contrast, the classic algorithm operates by appending successive rules to the end of the decision list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide improved predictive accuracy in less time than the classic algorithm.
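
A minimal sketch of the prepending mechanics described above: the list starts from a default rule, classification is decided by the first matching rule, and each newly inferred rule is placed at the front so that it overrides the rules already in the list. The restriction to single attribute-value tests and the simple gain heuristic are illustrative assumptions; this is a toy reconstruction, not the paper's algorithm.

DATA = [
    ({"outlook": "sunny",    "windy": "no"},  "play"),
    ({"outlook": "sunny",    "windy": "yes"}, "play"),
    ({"outlook": "overcast", "windy": "no"},  "play"),
    ({"outlook": "rain",     "windy": "yes"}, "stay"),
    ({"outlook": "rain",     "windy": "no"},  "stay"),
]

def classify(decision_list, example):
    for condition, label in decision_list:
        if condition is None or all(example.get(a) == v for a, v in condition):
            return label                        # first matching rule wins

def candidate_rules():
    tests = {(a, v) for e, _ in DATA for a, v in e.items()}
    labels = {label for _, label in DATA}
    return [(((a, v),), label) for a, v in tests for label in labels]

def learn_by_prepending():
    majority = max({l for _, l in DATA}, key=lambda l: sum(y == l for _, y in DATA))
    decision_list = [(None, majority)]          # default rule sits at the end
    while True:
        errors = [(e, y) for e, y in DATA if classify(decision_list, e) != y]
        if not errors:
            return decision_list
        def gain(rule):
            fixed = sum(classify([rule], e) == y for e, y in errors)
            broken = sum(classify([rule] + decision_list, e) != y
                         for e, y in DATA if (e, y) not in errors)
            return fixed - broken
        best = max(candidate_rules(), key=gain)
        if gain(best) <= 0:
            return decision_list
        decision_list = [best] + decision_list  # prepend: the new rule goes first

print(learn_by_prepending())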

Webb, G. I. (1994). Generality Is More Significant Than Complexity: Toward An Alternative To Occam's Razor. Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94), Singapore, pp. 60-67.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb94b,
Title = {Generality Is More Significant Than Complexity: Toward An Alternative To {Occam}'s Razor},
Author = {G. I. Webb},
Booktitle = {Artificial Intelligence: Sowing the Seeds for the Future, Proceedings of Seventh Australian Joint Conference on Artificial Intelligence (AI'94)},
Year = {1994},
Address = {Singapore},
Editor = {C. Zhang and J. Debenham and D. Lukose},
Pages = {60-67},
Publisher = {World Scientific},
Abstract = {Occam's Razor is widely employed in machine learning to select between classifiers with equal empirical support. This paper presents the theorem of decreasing inductive power: that, all other things being equal, if two classifiers a and b cover identical cases from the training set and a is a generalisation of b, a has higher probability than b of misclassifying a previously unsighted case. This theorem suggests that, to the contrary of Occam's Razor, generality, not complexity, should be used to select between classifiers with equal empirical support. Two studies are presented. The first study demonstrates that the theorem of decreasing inductive power holds for a number of commonly studied learning problems and for a number of different means of manipulating classifier generality. The second study demonstrates that generality provides a more consistent indicator of predictive accuracy in the context of a default rule than does complexity. These results suggest that the theorem of decreasing predictive power provides a suitable theoretical framework for the development of learning biases for use in selecting between classifiers with identical empirical support},
Audit-trail = {*},
Keywords = {Occams Razor and Rule Learning and Generality},
Location = {Armidale, NSW, Australia},
Related = {occams-razor-in-machine-learning}
}
ABSTRACT Occam's Razor is widely employed in machine learning to select between classifiers with equal empirical support. This paper presents the theorem of decreasing inductive power: that, all other things being equal, if two classifiers a and b cover identical cases from the training set and a is a generalisation of b, a has higher probability than b of misclassifying a previously unsighted case. This theorem suggests that, to the contrary of Occam's Razor, generality, not complexity, should be used to select between classifiers with equal empirical support. Two studies are presented. The first study demonstrates that the theorem of decreasing inductive power holds for a number of commonly studied learning problems and for a number of different means of manipulating classifier generality. The second study demonstrates that generality provides a more consistent indicator of predictive accuracy in the context of a default rule than does complexity. These results suggest that the theorem of decreasing predictive power provides a suitable theoretical framework for the development of learning biases for use in selecting between classifiers with identical empirical support

Webb, G. I. (1993). Systematic Search for Categorical Attribute-Value Data-Driven Machine Learning. Proceedings of the Sixth Australian Joint Conference on Artificial Intelligence (AI'93), Singapore, pp. 342-347.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb93a,
Title = {Systematic Search for Categorical Attribute-Value Data-Driven Machine Learning},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Sixth Australian Joint Conference on Artificial Intelligence (AI'93)},
Year = {1993},
Address = {Singapore},
Editor = {C. Rowles and H. Liu and N. Foo},
Pages = {342-347},
Publisher = {World Scientific},
Abstract = {Optimal Pruning for Unordered Search is a search algorithm that enables complete search through the space of possible disjuncts at the inner level of a covering algorithm. This algorithm takes as inputs an evaluation function, e, a training set, t, and a set of specialisation operators, o. It outputs a set of operators from o that creates a classifier that maximises e with respect to t. While OPUS has exponential worst case time complexity, the algorithm is demonstrated to reach solutions for complex real world domains within reasonable time frames. Indeed, for some domains, the algorithm exhibits greater computational efficiency than common heuristic search algorithms.},
Audit-trail = {*},
Keywords = {Search and Rule Learning and OPUS},
Location = {Melbourne, Australia},
Related = {opus-search}
}
ABSTRACT Optimal Pruning for Unordered Search is a search algorithm that enables complete search through the space of possible disjuncts at the inner level of a covering algorithm. This algorithm takes as inputs an evaluation function, e, a training set, t, and a set of specialisation operators, o. It outputs a set of operators from o that creates a classifier that maximises e with respect to t. While OPUS has exponential worst case time complexity, the algorithm is demonstrated to reach solutions for complex real world domains within reasonable time frames. Indeed, for some domains, the algorithm exhibits greater computational efficiency than common heuristic search algorithms.

Webb, G. I. (1993). Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents' Competency. Proceedings of the 1993 World Conference on Artificial Intelligence in Education (AI-ED'93), Charlottesville, VA, pp. 497-504.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb93c,
Title = {Feature Based Modelling: A Methodology for Producing Coherent, Consistent, Dynamically Changing Models of Agents' Competency},
Author = {G. I. Webb},
Booktitle = {Proceedings of the 1993 World Conference on Artificial Intelligence in Education (AI-ED'93)},
Year = {1993},
Address = {Charlottesville, VA},
Editor = {P. Brna and S. Ohlsson and H. Pain },
Pages = {497-504},
Publisher = {AACE},
Abstract = {Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90\% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and Case Based Learning},
Location = {Edinburgh, Scotland. Also published in User Modeling and User-Adapted Interaction. 5: 117-150, 1996},
Related = {feature-based-modeling}
}
ABSTRACT Feature Based Modelling uses attribute value machine learning techniques to model an agent's competency. This is achieved by creating a model describing the relationships between the features of the agent's actions and of the contexts in which those actions are performed. This paper describes techniques that have been developed for creating these models and for extracting key information therefrom. An overview is provided of previous studies that have evaluated the application of Feature Based Modelling in a number of educational contexts including piano keyboard playing, the unification of Prolog terms and elementary subtraction. These studies have demonstrated that the approach is applicable to a wide spectrum of domains. Classroom use has demonstrated the low computational overheads of the technique. A new study of the application of the approach to modelling elementary subtraction skills is presented. The approach demonstrates accuracy in excess of 90\% when predicting student solutions. It also demonstrates the ability to identify and model students' buggy arithmetic procedures.

Webb, G. I. (1993). Control, Capabilities and Communication: Three Key Issues for Machine-Expert Collaborative Knowledge Acquisition. Proceedings (Complement) of the Seventh European Workshop on Knowledge Acquisition for Knowledge-based Systems (EWKA'93), pp. 263-275.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb93e,
Title = {Control, Capabilities and Communication: Three Key Issues for Machine-Expert Collaborative Knowledge Acquisition},
Author = {G. I. Webb},
Booktitle = {Proceedings (Complement) of the Seventh European Workshop on Knowledge Acquisition for Knowledge-based Systems (EWKA'93)},
Year = {1993},
Editor = {N. Aussenac and G. Boy and B. Gaines and M. Linster and J.G. Ganascia and Y. Kodratoff },
Pages = {263-275},
Publisher = {Springer-Verlag},
Abstract = {Machine learning and knowledge elicitation are different but complementary approaches to knowledge acquisition. On the face of it there are large potential gains to be reaped from the integration of these two knowledge acquisition techniques. Machine-expert collaborative knowledge acquisition combines these approaches by placing the machine learning system and the human expert as partners in the knowledge-acquisition task. This paper examines three key issues facing machine-expert collaborative knowledge-acquisition: where should control reside, what capabilities should each partner bring to the task and how should the partners communicate?},
Audit-trail = {*},
Keywords = {Machine Learning with Knowledge Acquisition from Experts},
Location = {Toulouse, France},
Related = {interactive-machine-learning}
}
ABSTRACT Machine learning and knowledge elicitation are different but complementary approaches to knowledge acquisition. On the face of it there are large potential gains to be reaped from the integration of these two knowledge acquisition techniques. Machine-expert collaborative knowledge acquisition combines these approaches by placing the machine learning system and the human expert as partners in the knowledge-acquisition task. This paper examines three key issues facing machine-expert collaborative knowledge-acquisition: where should control reside, what capabilities should each partner bring to the task and how should the partners communicate?

Webb, G. I., & Brkic, N. (1993). Learning Decision Lists by Prepending Inferred Rules. Proceedings of the AI 93 Workshop on Machine Learning and Hybrid Systems, pp. 6-10.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbBrkic93,
Title = {Learning Decision Lists by Prepending Inferred Rules},
Author = {G. I. Webb and N. Brkic},
Booktitle = {Proceedings of the AI 93 Workshop on Machine Learning and Hybrid Systems},
Year = {1993},
Editor = {S. Sestito},
Pages = {6-10},
Abstract = {This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. This contrasts with the original decision list induction algorithm, which operates by appending successive rules to the end of the list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide higher predictive accuracy than those produced by the original decision list induction algorithm.},
Audit-trail = {*},
Keywords = {Prepend and Rule Learning},
Location = {Melbourne, Australia},
Related = {prepending}
}
ABSTRACT This paper describes a new algorithm for learning decision lists that operates by prepending successive rules to the front of the list under construction. This contrasts with the original decision list induction algorithm, which operates by appending successive rules to the end of the list under construction. The new algorithm is demonstrated in the majority of cases to produce smaller classifiers that provide higher predictive accuracy than those produced by the original decision list induction algorithm.

Webb, G. I. (1993). DLGref2: Techniques for Inductive Rule Refinement. Proceedings of the 1993 IJCAI Workshop W16: Machine Learning and Knowledge Acquisition, pp. 236-252.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb93d,
Title = {DLGref2: Techniques for Inductive Rule Refinement},
Author = {G.I. Webb},
Booktitle = {Proceedings of the 1993 {IJCAI} Workshop W16: Machine Learning and Knowledge Acquisition},
Year = {1993},
Pages = {236-252},
Abstract = {This paper describes and evaluates machine learning techniques for knowledge-base refinement. These techniques are central to Einstein, a knowledge acquisition system that enables a human expert to collaborate with a machine learning system at all stages of the knowledge-acquisition cycle. Experimental evaluation demonstrates that the knowledge-base refinement techniques are able to significantly increase the accuracy of nontrivial expert systems in a wide variety of domains.},
Audit-trail = {Reconstructed paper posted May 2005},
Keywords = {Rule Learning},
Location = {Chambery, France}
}
ABSTRACT This paper describes and evaluates machine learning techniques for knowledge-base refinement. These techniques are central to Einstein, a knowledge acquisition system that enables a human expert to collaborate with a machine learning system at all stages of the knowledge-acquisition cycle. Experimental evaluation demonstrates that the knowledge-base refinement techniques are able to significantly increase the accuracy of nontrivial expert systems in a wide variety of domains.

Webb, G. I., & Agar, J. (1992). Inducing Diagnostic Rules For Glomerular Disease With The DLG Machine Learning Algorithm. Artificial Intelligence in Medicine, 4(6), 419-430.
[DOI] [Bibtex] [Abstract]

@Article{WebbAgar92,
Title = {Inducing Diagnostic Rules For Glomerular Disease With The DLG Machine Learning Algorithm},
Author = {G. I. Webb and J. Agar},
Journal = {Artificial Intelligence in Medicine},
Year = {1992},
Number = {6},
Pages = {419-430},
Volume = {4},
Abstract = {A pilot study has applied the DLG machine learning algorithm to create expert systems for the assessment and interpretation of clinical and laboratory data in glomerular disease. Despite the limited size of the data-set and major deficiencies in the information recorded therein, promising results have been obtained. On average, 100 expert systems developed from different subsets of the database had a diagnostic accuracy of 54.70% when applied to cases that had not been used in their development. This compares with an average diagnostic accuracy of 48.31% obtained by four expert clinicians and of 3.23% obtained by random diagnosis. The expert systems demonstrated increased accuracy (62.90% on average) when cases of diseases represented by less than twenty examples were discarded. These results suggest that database expansion may enable the induction of diagnostic rules that provide accurate non-invasive diagnosis of specific categories of glomerular disease.},
Audit-trail = {Only on-line since 1995 via Science Direct},
Doi = {10.1016/0933-3657(92)90010-M},
Keywords = {Rule Learning},
Publisher = {Elsevier}
}
ABSTRACT A pilot study has applied the DLG machine learning algorithm to create expert systems for the assessment and interpretation of clinical and laboratory data in glomerular disease. Despite the limited size of the data-set and major deficiencies in the information recorded therein, promising results have been obtained. On average, 100 expert systems developed from different subsets of the database had a diagnostic accuracy of 54.70% when applied to cases that had not been used in their development. This compares with an average diagnostic accuracy of 48.31% obtained by four expert clinicians and of 3.23% obtained by random diagnosis. The expert systems demonstrated increased accuracy (62.90% on average) when cases of diseases represented by less than twenty examples were discarded. These results suggest that database expansion may enable the induction of diagnostic rules that provide accurate non-invasive diagnosis of specific categories of glomerular disease.

Webb, G. I. (1992). Man-Machine Collaboration for Knowledge Acquisition. Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92), Singapore, pp. 329-334.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb92,
Title = {Man-Machine Collaboration for Knowledge Acquisition},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92)},
Year = {1992},
Address = {Singapore},
Editor = {A. Adams and L. Sterling},
Pages = {329-334},
Publisher = {World Scientific},
Abstract = {Both machine learning and knowledge elicitation from human experts have unique strengths and weaknesses. Man-machine collaboration for knowledge acquisition allows both knowledge acquisition techniques to be employed hand-in-hand. The strengths of each can alleviate the other's weaknesses. This has the potential both to reduce the time taken to develop an expert system and to increase the quality of the finished product. This paper discusses techniques for man-machine collaboration for knowledge acquisition and describes Einstein, a computer system that implements those techniques.},
Audit-trail = {*},
Keywords = {Machine Learning with Knowledge Acquisition from Experts},
Location = {Hobart, Tas., Australia},
Related = {interactive-machine-learning}
}
ABSTRACT Both machine learning and knowledge elicitation from human experts have unique strengths and weaknesses. Man-machine collaboration for knowledge acquisition allows both knowledge acquisition techniques to be employed hand-in-hand. The strengths of each can alleviate the other's weaknesses. This has the potential both to reduce the time taken to develop an expert system and to increase the quality of the finished product. This paper discusses techniques for man-machine collaboration for knowledge acquisition and describes Einstein, a computer system that implements those techniques.

Yip, S., & Webb, G. I. (1992). Discriminate Attribute Finding in Classification Learning. Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92), Singapore, pp. 374-379.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YipWebb92b,
Title = {Discriminate Attribute Finding in Classification Learning},
Author = {S. Yip and G. I. Webb},
Booktitle = {Proceedings of the Fifth Australian Joint Conference on Artificial Intelligence (AI'92)},
Year = {1992},
Address = {Singapore},
Editor = {A. Adams and L. Sterling},
Pages = {374-379},
Publisher = {World Scientific},
Abstract = {This paper describes a method for extending domain models in classification learning by deriving new attributes from existing ones. The process starts by examining examples of different classes which have overlapping ranges in all of their numeric attribute values. Based on existing attributes, new attributes which enhance the distinguishability of a class are created. These additional attributes are then used in the subsequent classification learning process. The research revealed that this method can enable relationships between attributes to be incorporated in the classification procedures and, depending on the nature of data, significantly increase the coverage of class descriptions, improve the accuracy of classifying novel instances and reduce the number of clauses in class description when compared to classification learning alone. Evaluation with the data on iris flower classification showed that the classification accuracy is slightly improved and the number of clauses in the class description is significantly reduced.},
Keywords = {Constructive Induction},
Location = {Hobart, Tas., Australia},
Related = {feature-construction}
}
ABSTRACT This paper describes a method for extending domain models in classification learning by deriving new attributes from existing ones. The process starts by examining examples of different classes which have overlapping ranges in all of their numeric attribute values. Based on existing attributes, new attributes which enhance the distinguishability of a class are created. These additional attributes are then used in the subsequent classification learning process. The research revealed that this method can enable relationships between attributes to be incorporated in the classification procedures and, depending on the nature of data, significantly increase the coverage of class descriptions, improve the accuracy of classifying novel instances and reduce the number of clauses in class description when compared to classification learning alone. Evaluation with the data on iris flower classification showed that the classification accuracy is slightly improved and the number of clauses in the class description is significantly reduced.
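
To illustrate the general idea summarised in this abstract, the short Python sketch below derives a new attribute (here, simply the ratio of two existing numeric attributes) and checks whether its per-class value ranges separate classes that overlap on every original attribute. The choice of ratios as the derived form and the toy data are illustrative assumptions; this is not the authors' algorithm.

# Illustrative sketch only (not the published method): derive a new attribute
# as a ratio of two existing numeric attributes and test whether it separates
# classes whose original attribute ranges all overlap.
from itertools import combinations

def ranges_overlap(a_values, b_values):
    # True if the value ranges of two classes overlap on one attribute.
    return max(min(a_values), min(b_values)) <= min(max(a_values), max(b_values))

def find_discriminating_ratio(examples):
    # examples: dict mapping class label -> list of numeric attribute tuples.
    # Returns an attribute pair (i, j) whose ratio x_i / x_j has non-overlapping
    # per-class ranges, or None. Assumes non-zero denominators.
    classes = list(examples)
    n_attrs = len(next(iter(examples.values()))[0])
    for i, j in combinations(range(n_attrs), 2):
        derived = {c: [row[i] / row[j] for row in rows] for c, rows in examples.items()}
        if not any(ranges_overlap(derived[a], derived[b])
                   for a, b in combinations(classes, 2)):
            return i, j
    return None

# Toy data: both original attributes overlap across the classes, but their ratio does not.
toy = {"A": [(2.0, 4.0), (5.0, 10.0)],    # ratio = 0.5
       "B": [(4.0, 2.0), (10.0, 5.0)]}    # ratio = 2.0
print(find_discriminating_ratio(toy))     # -> (0, 1)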

Yip, S., & Webb, G. I. (1992). Function Finding in Classification Learning. Proceedings of the Second Pacific Rim International Conference on Artificial Intelligence (PRICAI '92), Berlin, pp. 555-561.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{YipWebb92a,
Title = {Function Finding in Classification Learning},
Author = {S. Yip and G. I. Webb},
Booktitle = {Proceedings of the Second {Pacific} Rim International Conference on Artificial Intelligence (PRICAI '92)},
Year = {1992},
Address = {Berlin},
Pages = {555-561},
Publisher = {Springer-Verlag},
Abstract = {The paper describes a method for extending domain models in classification learning by deriving new attributes from existing attributes. The process starts by finding functional regularities within each class. Such regularities are then treated as additional attributes in the subsequent classification learning process. The research revealed that these techniques can reduce the number of clauses required to describe each class, enable functional regularities between attributes to be incorporated in the classification procedures and, depending on the nature of data, significantly increase the coverage of class descriptions and improve the accuracy of classifying novel instances when compared to classification learning alone.},
Keywords = {Constructive Induction},
Location = {Seoul, Korea},
Related = {feature-construction}
}
ABSTRACT The paper describes a method for extending domain models in classification learning by deriving new attributes from existing attributes. The process starts by finding functional regularities within each class. Such regularities are then treated as additional attributes in the subsequent classification learning process. The research revealed that these techniques can reduce the number of clauses required to describe each class, enable functional regularities between attributes to be incorporated in the classification procedures and, depending on the nature of data, significantly increase the coverage of class descriptions and improve the accuracy of classifying novel instances when compared to classification learning alone.
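
As a rough illustration of treating a within-class functional regularity as an additional attribute, the Python sketch below fits a least-squares line to one class and uses the residual from that line as a derived attribute: instances consistent with the regularity score near zero. The linear form, the residual encoding and the toy data are assumptions for illustration only, not the method evaluated in the paper.

# Illustrative sketch, not the published method: find a linear regularity
# y ~ a*x + b within one class, then use the residual |y - (a*x + b)| as a
# derived attribute (small residual = instance "fits" the class regularity).
def fit_line(points):
    # Ordinary least-squares fit of y = a*x + b to (x, y) pairs.
    n = len(points)
    sx = sum(x for x, _ in points); sy = sum(y for _, y in points)
    sxx = sum(x * x for x, _ in points); sxy = sum(x * y for x, y in points)
    a = (n * sxy - sx * sy) / (n * sxx - sx * sx)
    b = (sy - a * sx) / n
    return a, b

def residual_attribute(a, b, x, y):
    # Derived attribute: how far an instance departs from the class regularity.
    return abs(y - (a * x + b))

class_a = [(1.0, 2.1), (2.0, 3.9), (3.0, 6.0), (4.0, 8.1)]   # roughly y = 2x
a, b = fit_line(class_a)
print(residual_attribute(a, b, 5.0, 10.2))   # small: consistent with the regularity
print(residual_attribute(a, b, 5.0, 3.0))    # large: violates it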

Kuzmycz, M., & Webb, G. I. (1992). Evaluation of Feature Based Modelling in Subtraction. Lecture Notes in Computer Science Vol. 608: Proceedings of the Second International Conference on Intelligent Tutoring Systems (ITS'92), Berlin, pp. 269-276.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{KuzmyczWebb92,
Title = {Evaluation of Feature Based Modelling in Subtraction},
Author = {M. Kuzmycz and G. I. Webb},
Booktitle = {Lecture Notes in Computer Science Vol. 608: Proceedings of the Second International Conference on Intelligent Tutoring Systems (ITS'92)},
Year = {1992},
Address = {Berlin},
Editor = {C. Frasson and G. Gauthier and G. I. McCalla},
Pages = {269-276},
Publisher = {Springer-Verlag},
Abstract = {One aim of intelligent tutoring systems is to tailor lessons to each individual student's needs. To do this a tutoring system requires a model of the student's knowledge. Cognitive modelling aims to produce a detailed explanation of the student's progress. Feature Based Modelling forms a cognitive model of the student by creating aspects of problem descriptions and of students' responses. This paper will discuss Feature Based Modelling and show the results of an evaluation carried out in the domain of elemental subtraction.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and Case Based Learning},
Location = {Montréal, Canada},
Related = {feature-based-modeling}
}
ABSTRACT One aim of intelligent tutoring systems is to tailor lessons to each individual student's needs. To do this a tutoring system requires a model of the student's knowledge. Cognitive modelling aims to produce a detailed explanation of the student's progress. Feature Based Modelling forms a cognitive model of the student by creating aspects of problem descriptions and of students' responses. This paper will discuss Feature Based Modelling and show the results of an evaluation carried out in the domain of elemental subtraction.
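
The fragment below is a hypothetical Python sketch of the general flavour of Feature Based Modelling as summarised above: it tallies how often features of a student's responses accompany features of the problems presented, and infers an association once the co-occurrence is consistent. The feature names, thresholds and update rule are invented for the example; they are not the system evaluated in the paper.

# Hypothetical sketch only: associate problem features with response features.
from collections import defaultdict

class FeatureModel:
    def __init__(self):
        self.problem_seen = defaultdict(int)      # problem feature -> times presented
        self.co_occurrence = defaultdict(int)     # (problem, response) -> times together

    def observe(self, problem_features, response_features):
        for p in problem_features:
            self.problem_seen[p] += 1
            for r in response_features:
                self.co_occurrence[(p, r)] += 1

    def associations(self, min_ratio=0.8, min_seen=3):
        # Infer "whenever the problem shows p, the student responds with r".
        return [(p, r) for (p, r), n in self.co_occurrence.items()
                if self.problem_seen[p] >= min_seen and n / self.problem_seen[p] >= min_ratio]

# Invented subtraction example: the student subtracts the smaller digit from
# the larger whenever a column requires borrowing.
m = FeatureModel()
for _ in range(4):
    m.observe({"column-requires-borrow"}, {"smaller-from-larger"})
print(m.associations())   # [('column-requires-borrow', 'smaller-from-larger')]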

Agar, J., & Webb, G. I. (1992). Application Of Machine Learning To A Renal Biopsy Data-Base. Nephrology, Dialysis and Transplantation, 7, 472-478.
[URL] [Bibtex] [Abstract]

@Article{AgarWebb92,
Title = {Application Of Machine Learning To A Renal Biopsy Data-Base},
Author = {J. Agar and G. I. Webb},
Journal = {Nephrology, Dialysis and Transplantation},
Year = {1992},
Pages = {472-478},
Volume = {7},
Abstract = {This pilot study has applied machine learning (artificial intelligence derived qualitative analysis procedures) to yield non-invasive techniques for the assessment and interpretation of clinical and laboratory data in glomerular disease. To evaluate the appropriateness of these techniques, they were applied to subsets of a small database of 284 case histories and the resulting procedures evaluated against the remaining cases. Over such evaluations, the following average diagnostic accuracies were obtained: microscopic polyarteritis, 95.37%; minimal lesion nephrotic syndrome, 96.50%; immunoglobulin A nephropathy, 81.26%; minor changes, 93.66%; lupus nephritis, 96.27%; focal glomerulosclerosis, 92.06%; mesangial proliferative glomerulonephritis, 92.56%; and membranous nephropathy, 92.56%. Although in general the new diagnostic system is not yet as accurate as the histological evaluation of renal biopsy specimens, it shows promise of adding a further dimension to the diagnostic process. When the machine learning techniques are applied to a larger database, greater diagnostic accuracy should be obtained. It may allow accurate non-invasive diagnosis of some cases of glomerular disease without the need for renal biopsy. This may reduce both the cost and the morbidity of the investigation of glomerular disease and may be of particular value in situations where renal biopsy is considered hazardous or contraindicated.},
Address = {Oxford UK},
Audit-trail = {28/10/03 Link to abstract only at this stage available via Oxford Press.},
Keywords = {Rule Learning},
Publisher = {Oxford University Press},
Url = {http://ndt.oxfordjournals.org/content/7/6/472.abstract}
}
ABSTRACT This pilot study has applied machine learning (artificial intelligence derived qualitative analysis procedures) to yield non-invasive techniques for the assessment and interpretation of clinical and laboratory data in glomerular disease. To evaluate the appropriateness of these techniques, they were applied to subsets of a small database of 284 case histories and the resulting procedures evaluated against the remaining cases. Over such evaluations, the following average diagnostic accuracies were obtained: microscopic polyarteritis, 95.37%; minimal lesion nephrotic syndrome, 96.50%; immunoglobulin A nephropathy, 81.26%; minor changes, 93.66%; lupus nephritis, 96.27%; focal glomerulosclerosis, 92.06%; mesangial proliferative glomerulonephritis, 92.56%; and membranous nephropathy, 92.56%. Although in general the new diagnostic system is not yet as accurate as the histological evaluation of renal biopsy specimens, it shows promise of adding a further dimension to the diagnostic process. When the machine learning techniques are applied to a larger database, greater diagnostic accuracy should be obtained. It may allow accurate non-invasive diagnosis of some cases of glomerular disease without the need for renal biopsy. This may reduce both the cost and the morbidity of the investigation of glomerular disease and may be of particular value in situations where renal biopsy is considered hazardous or contraindicated.

Smith, P. A., & Webb, G. I. (1992). Recent progress in the Development of a Debugging Assistant for Computer Programs. A Future Promised: Proceedings of the Fifth Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '92), pp. 351-356.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{SmithWebb92,
Title = {Recent progress in the Development of a Debugging Assistant for Computer Programs},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {A Future Promised: Proceedings of the Fifth Australian Society for Computers in Learning in Tertiary Education Conference (ASCILITE '92)},
Year = {1992},
Editor = {B. Chia and R. Pennell and R. Sims},
Pages = {351-356},
Abstract = {We present recent progress in the development of a debugging assistant for helping novices debug their computer programs. Bradman, which is still in the implementation phase, is an interactive system which builds two models of the user's program - one reflecting what the program actually does and the other reflecting what the programmer intended to do. Conflicts between these two models are used by Bradman to find bugs in the program.},
Keywords = {Program Visualisation},
Location = {Sydney, Australia},
Related = {program-visualisation}
}
ABSTRACT We present recent progress in the development of a debugging assistant for helping novices debug their computer programs. Bradman, which is still in the implementation phase, is an interactive system which builds two models of the user's program - one reflecting what the program actually does and the other reflecting what the programmer intended to do. Conflicts between these two models are used by Bradman to find bugs in the program.
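
The toy Python fragment below illustrates, in a very reduced form, the idea of comparing a model of what a program actually does with a model of what the programmer intended and reporting the conflicts. It is not Bradman; representing the intended model as input/output pairs is an assumption made only for this illustration.

# Not Bradman itself -- just a toy illustration of comparing actual behaviour
# against intended behaviour and reporting the conflicts as candidate bugs.
def actual_program(n):
    return sum(range(n))          # buggy: intended sum of 1..n, but omits n

intended_model = {1: 1, 2: 3, 3: 6, 4: 10}   # programmer's intention as I/O pairs

conflicts = [(n, expected, actual_program(n))
             for n, expected in intended_model.items()
             if actual_program(n) != expected]
print(conflicts)   # every case conflicts, pointing at the range bound as the bug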

Webb, G. I. (1991). An Attribute-Value Machine Learning Approach To Student Modelling. Proceedings of the IJCAI Workshop W.4: Agent Modelling for Intelligent Interaction, pp. 128-136.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb91e,
Title = {An Attribute-Value Machine Learning Approach To Student Modelling},
Author = {G.I. Webb},
Booktitle = {Proceedings of the {IJCAI} Workshop {W.4}: Agent Modelling for Intelligent Interaction},
Year = {1991},
Editor = {J. Kay and A. Quilici},
Pages = {128-136},
Abstract = {This paper describes an application of machine learning to student modelling. Unlike previous machine learning approaches to student modelling, the new approach is based on attribute-value machine learning. In contrast to many previous approaches it is not necessary for the lesson author to identify all forms of error that may be detected or to identify the possible approaches to problem solving in the domain that may be adopted. Rather, the lesson author need only identify the relevant attributes both of the tasks to be performed by the student and of the student's actions. The values of these attributes are automatically processed by the student modeler to produce the student model.},
Audit-trail = {Paper scanned and converted to word. PDF now up},
Keywords = {Feature Based Modeling and Computer Based Learning},
Location = {Sydney, Australia},
Related = {feature-based-modeling}
}
ABSTRACT This paper describes an application of machine learning to student modelling. Unlike previous machine learning approaches to student modelling, the new approach is based on attribute-value machine learning. In contrast to many previous approaches it is not necessary for the lesson author to identify all forms of error that may be detected or to identify the possible approaches to problem solving in the domain that may be adopted. Rather, the lesson author need only identify the relevant attributes both of the tasks to be performed by the student and of the student's actions. The values of these attributes are automatically processed by the student modeler to produce the student model.

Smith, P. A., & Webb, G. I. (1991). Debugging Using Partial Models. Simulation & Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91), Launceston, pp. 581-590.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{SmithWebb91,
Title = {Debugging Using Partial Models},
Author = {P. A. Smith and G. I. Webb},
Booktitle = {Simulation \& Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91)},
Year = {1991},
Address = {Launceston},
Editor = {R. Godfrey},
Pages = {581-590},
Publisher = {University of Tasmania},
Abstract = {We present initial work on an expert system that will help users to debug their Pascal programs. This system asks the user questions concerning attempts to build a 'partial model' of the program - a model of those aspects of the program likely to relate to the error. This contrasts with previous systems in which a complete model of the user's program is built and compared to templates of correct versions of the program. The advantages of this approach are greater flexibility, greater student involvement in the debugging process and lower computational overheads. },
Keywords = {Program Visualisation},
Location = {Launceston, TAS, Australia},
Related = {program-visualisation}
}
ABSTRACT We present initial work on an expert system that will help users to debug their Pascal programs. This system asks the user questions concerning attempts to build a 'partial model' of the program - a model of those aspects of the program likely to relate to the error. This contrasts with previous systems in which a complete model of the user's program is built and compared to templates of correct versions of the program. The advantages of this approach are greater flexibility, greater student involvement in the debugging process and lower computational overheads.

Webb, G. I., & Agar, J. (1991). The Application of Machine Learning to the Diagnosis of Glomerular Disease. Proceedings of the IJCAI Workshop W.15: Representing Knowledge in Medical Decision Support Systems, pp. 8.1-8.8.
[PDF] [Bibtex] [Abstract]

@InProceedings{WebbAgar91,
Title = {The Application of Machine Learning to the Diagnosis of Glomerular Disease},
Author = {G.I. Webb and J. Agar},
Booktitle = {Proceedings of the {IJCAI} Workshop W.15: Representing Knowledge in Medical Decision Support Systems},
Year = {1991},
Editor = {C. Sarmeinto},
Pages = {8.1-8.8},
Abstract = {A pilot study has applied the DLG machine learning algorithm to create expert systems for the assessment and interpretation of clinical and laboratory data in glomerular disease. Despite the limited size of the data-set and major deficiencies in the information recorded therein, for one of the conditions examined in this study, microscopic polyarteritis, a consistent diagnostic accuracy of 100% was obtained. With expansion of the data base, it is possible that techniques will be derived that provide accurate non-invasive diagnosis of some cases of glomerular disease, thus obviating the need for renal biopsy. Success in this project will result in significant reductions in both the cost and the morbidity associated with the investigation of glomerular disease.},
Audit-trail = {Reconstructed paper posted May 2006},
Keywords = {Rule Learning},
Location = {Sydney, Australia}
}
ABSTRACT A pilot study has applied the DLG machine learning algorithm to create expert systems for the assessment and interpretation of clinical and laboratory data in glomerular disease. Despite the limited size of the data-set and major deficiencies in the information recorded therein, for one of the conditions examined in this study, microscopic polyarteritis, a consistent diagnostic accuracy of 100% was obtained. With expansion of the data base, it is possible that techniques will be derived that provide accurate non-invasive diagnosis of some cases of glomerular disease, thus obviating the need for renal biopsy. Success in this project will result in significant reductions in both the cost and the morbidity associated with the investigation of glomerular disease.

Kuzmycz, M., & Webb, G. I. (1991). Modelling Elementary Subtraction: Intelligent Warfare Against Bugs. Simulation & Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91), Launceston, pp. 367-376.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{KuzmyczWebb91,
Title = {Modelling Elementary Subtraction: Intelligent Warfare Against Bugs},
Author = {M. Kuzmycz and G. I. Webb},
Booktitle = {Simulation \& Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91)},
Year = {1991},
Address = {Launceston},
Editor = {R. Godfrey},
Pages = {367-376},
Publisher = {University of Tasmania},
Abstract = {This paper discusses an intelligent system that uses Input/Output Cognitive Modelling (IOCM) techniques to form a model of the student. The paper describes FBM, an IOCM system that uses features to represent the inputs and outputs of the tasks being presented to the student and forms a relationship which describes in essence the knowledge the student has in the domain. Also presented is ASPMoRe, an intelligent tool that takes the model of the student and adapts the lesson to both refine the model and give the student practice in weak areas of his knowledge. Results have shown that the system can be an effective tool for educational purposes.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and Computer Based Learning},
Location = {Launceston, TAS, Australia},
Related = {feature-based-modeling}
}
ABSTRACT This paper discusses an intelligent system that uses Input/Output Cognitive Modelling (IOCM) techniques to form a model of the student. The paper describes FBM, an IOCM system that uses features to represent the inputs and outputs of the tasks being presented to the student and forms a relationship which describes in essence the knowledge the student has in the domain. Also presented is ASPMoRe, an intelligent tool that takes the model of the student and adapts the lesson to both refine the model and give the student practice in weak areas of his knowledge. Results have shown that the system can be an effective tool for educational purposes.

Webb, G. I. (1991). Inside the Unification Tutor: The Architecture of an Intelligent Educational System. Simulation & Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91), Launceston, pp. 677-684.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb91b,
Title = {Inside the Unification Tutor: The Architecture of an Intelligent Educational System},
Author = {G. I. Webb},
Booktitle = {Simulation \& Academic Gaming in Tertiary Education, The Proceedings of the Eighth Annual Conference of ASCILITE (ASCILITE '91)},
Year = {1991},
Address = {Launceston},
Editor = {R. Godfrey},
Pages = {677-684},
Publisher = {University of Tasmania},
Abstract = {The Unification Tutor provides practice and tuition on the unification of terms from the Prolog programming language. It integrates multiple knowledge sources encompassing both performance and declarative knowledge. A key feature of the tutor is the use of a detailed student model. It has been used since 1989 in Computer Science courses at Deakin and La Trobe Universities. Previous papers have examined the student modelling component of this system. This paper investigates the internal operation of the Unification Tutor, the sub-systems it incorporates and their interaction.},
Audit-trail = {Reconstructed paper posted Oct 05},
Keywords = {Feature Based Modeling and Computer Based Learning},
Related = {feature-based-modeling}
}
ABSTRACT The Unification Tutor provides practice and tuition on the unification of terms from the Prolog programming language. It integrates multiple knowledge sources encompassing both performance and declarative knowledge. A key feature of the tutor is the use of a detailed student model. It has been used since 1989 in Computer Science courses at Deakin and La Trobe Universities. Previous papers have examined the student modelling component of this system. This paper investigates the internal operation of the Unification Tutor, the sub-systems it incorporates and their interaction.

Webb, G. I. (1991). Data Driven Inductive Refinement of Production Rules. Proceedings of the First Australian Workshop on Knowledge Acquisition for Knowledge-Based Systems (AKAW '91), Sydney, pp. 44-52.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb91d,
Title = {Data Driven Inductive Refinement of Production Rules},
Author = {G. I. Webb},
Booktitle = {Proceedings of the First Australian Workshop on Knowledge Acquisition for Knowledge-Based Systems (AKAW '91)},
Year = {1991},
Address = {Sydney},
Editor = {R. Quinlan},
Pages = {44-52},
Publisher = {University of Sydney Press.},
Abstract = {This paper presents algorithms for inductive refinement of production rules based on the DLG data-driven machine learning algorithm. These algorithms modify the input production rules with reference to a set of examples so as to ensure that all positive examples are covered and no negative examples are covered. The input production rules may either have been previously learnt by a machine learning system or be extracted from an existing expert system.},
Audit-trail = {Reconstructed paper posted October 2005},
Keywords = {Rule Learning},
Location = {Pokolbin, NSW, Australia}
}
ABSTRACT This paper presents algorithms for inductive refinement of production rules based on the DLG data-driven machine learning algorithm. These algorithms modify the input production rules with reference to a set of examples so as to ensure that all positive examples are covered and no negative examples are covered. The input production rules may either have been previously learnt by a machine learning system or be extracted from an existing expert system.
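
A much-simplified Python sketch of the refinement goal described above (not the DLG-based algorithms themselves): adjust a conjunctive attribute-value rule against a set of examples so that every positive example is covered and, where this restricted hypothesis language permits, no negative example is. The attribute names and data are illustrative.

# Hedged sketch only: generalise a conjunctive attribute-value rule to cover
# all positives, then check that no negative example is covered.
def covers(rule, example):
    # rule: attr -> set of allowed values; example: attr -> value.
    return all(example[a] in allowed for a, allowed in rule.items())

def refine(rule, positives, negatives):
    rule = {a: set(v) for a, v in rule.items()}
    # Generalise: widen each condition just enough to admit every positive example.
    for ex in positives:
        for a in rule:
            rule[a].add(ex[a])
    # If a negative example is still covered, this simple conjunctive language
    # cannot separate the classes; a fuller system would specialise further.
    if any(covers(rule, ex) for ex in negatives):
        return None
    return rule

pos = [{"colour": "red", "size": "small"}, {"colour": "red", "size": "large"}]
neg = [{"colour": "blue", "size": "small"}]
seed = {"colour": {"red"}, "size": {"small"}}
print(refine(seed, pos, neg))   # 'size' now admits both 'small' and 'large'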

Webb, G. I. (1991). Einstein: An Interactive Inductive Knowledge-Acquisition Tool. Proceedings of the Sixth Banff Knowledge Acquisition for Knowledge-Based Systems Workshop, pp. (36)1-16.
[Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb91c,
Title = {Einstein: An Interactive Inductive Knowledge-Acquisition Tool},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Sixth Banff Knowledge Acquisition for Knowledge-Based Systems Workshop},
Year = {1991},
Pages = {(36)1-16},
Abstract = {Einstein is a knowledge acquisition system that incorporates data-driven inductive rule development and refinement in a user driven production rule development and evaluation environment. This allows the user and the induction system to interact as a cooperative knowledge acquisition team. Unique features of this system include efficient automated inductive refinement of existing production rules, interactive user management of machine learning facilities, including local and global guidance, interactive specification of key examples and counter-examples and interactive case-based rule assessment.},
Audit-trail = {*},
Keywords = {Machine Learning with Knowledge Acquisition from Experts},
Location = {Banff, Canada},
Related = {interactive-machine-learning}
}
ABSTRACT Einstein is a knowledge acquisition system that incorporates data-driven inductive rule development and refinement in a user driven production rule development and evaluation environment. This allows the user and the induction system to interact as a cooperative knowledge acquisition team. Unique features of this system include efficient automated inductive refinement of existing production rules, interactive user management of machine learning facilities, including local and global guidance, interactive specification of key examples and counter-examples and interactive case-based rule assessment.

Sanzogni, L., Surruwerra, F., & Webb, G. I. (1990). Improving the Efficiency of Rule Based Expert Systems by Rule Activation. Journal of Experimental and Theoretical Artificial Intelligence, 2, 369-380.
[PDF] [Bibtex] [Abstract]

@Article{SanzogniSurruwerraWebb90,
Title = {Improving the Efficiency of Rule Based Expert Systems by Rule Activation},
Author = {L. Sanzogni and F. Surruwerra and G.I. Webb},
Journal = {Journal of Experimental and Theoretical Artificial Intelligence},
Year = {1990},
Pages = {369-380},
Volume = {2},
Abstract = {In this paper we test a hypothesis that has shown promise in enhancing the efficiency (run-time) of rule-based systems. The results of our experiments suggest that the use of rule activation plays an active part in improving the performance of rule bases containing conflict sets.},
Audit-trail = {Reconstructed paper posted Nov 2005},
Publisher = {Taylor and Francis}
}
ABSTRACT In this paper we test a hypothesis that has shown promise in enhancing the efficiency (run-time) of rule-based systems. The results of our experiments suggest that the use of rule activation plays an active part in improving the performance of rule bases containing conflict sets.

Webb, G. I., Cumming, G., Richards, T., & Yum, K-K. (1990). Educational Evaluation of Feature Based Modelling in a Problem Solving Domain. Proceedings of the IFIP TC3 International Conference on Advanced Research on Computers in Education (ARCE'90), Amsterdam, pp. 101-108.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbCummingRichardsYum90,
Title = {Educational Evaluation of Feature Based Modelling in a Problem Solving Domain},
Author = {G. I. Webb and G. Cumming and T. Richards and K-K. Yum},
Booktitle = {Proceedings of the IFIP TC3 International Conference on Advanced Research on Computers in Education (ARCE'90)},
Year = {1990},
Address = {Amsterdam},
Editor = {R. Lewis and S. Otsuki},
Pages = {101-108},
Publisher = {Elsevier},
Abstract = {Feature-Based Modelling is a machine learning based cognitive modelling methodology. An intelligent educational system has been implemented, for the purpose of evaluating the methodology, which helps students learn about the unification of terms from the Prolog programming language. The system has been used by Third Year Computer Science students at La Trobe University during September 1989. Students were randomly allocated to an Experimental condition, in which FBM modelling was used to select tasks, and give extra comments, or to a Control condition in which similar tasks and comments were given, but without FBM tailoring to the individual. Ratings of task appropriateness, and comment usefulness, were collected on-line as the students worked with the tutor; overall ratings were obtained by questionnaire at the end; and semester exam results were examined. Despite the fact that only a minority of students showed sufficient misunderstanding for FBM to have potential value, of the ten comparisons that relate most directly to the aims of the Tutor, while in no case reaching significance, seven were in favour of the Tutor, and only two against. These preliminary results are very encouraging for the FBM principles of the Tutor.},
Audit-trail = {Pre-pub pdf posted 26/5/05},
Keywords = {Feature Based Modeling and Computer Based Learning},
Location = {Tokyo, Japan},
Related = {feature-based-modeling}
}
ABSTRACT Feature-Based Modelling is a machine learning based cognitive modelling methodology. An intelligent educational system has been implemented, for the purpose of evaluating the methodology, which helps students learn about the unification of terms from the Prolog programming language. The system has been used by Third Year Computer Science students at La Trobe University during September 1989. Students were randomly allocated to an Experimental condition, in which FBM modelling was used to select tasks, and give extra comments, or to a Control condition in which similar tasks and comments were given, but without FBM tailoring to the individual. Ratings of task appropriateness, and comment usefulness, were collected on-line as the students worked with the tutor; overall ratings were obtained by questionnaire at the end; and semester exam results were examined. Despite the fact that only a minority of students showed sufficient misunderstanding for FBM to have potential value, of the ten comparisons that relate most directly to the aims of the Tutor, while in no case reaching significance, seven were in favour of the Tutor, and only two against. These preliminary results are very encouraging for the FBM principles of the Tutor.

Webb, G. I. (1990). Rule Optimisation and Theory Optimisation: Heuristic Search Strategies for Data-Driven Machine Learning. Proceedings of the First Japanese Knowledge Acquisition for Knowledge-Based Systems Workshop (JKAW'90), Tokyo, pp. 219-232.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb90,
Title = {Rule Optimisation and Theory Optimisation: Heuristic Search Strategies for Data-Driven Machine Learning},
Author = {G. I. Webb},
Booktitle = {Proceedings of the First Japanese Knowledge Acquisition for Knowledge-Based Systems Workshop (JKAW'90)},
Year = {1990},
Address = {Tokyo},
Editor = {H. Motada and R. Mizoguchi and J. Boose and B. Gaines},
Pages = {219-232},
Publisher = {IOS Press},
Abstract = {Previous implementations of the Aq algorithm have used rule optimisation search strategies to attempt to develop optimal classification procedures. These strategies involve generating successive characteristic descriptions each of which is individually of maximal value. This is contrasted with theory optimisation search strategies which, instead, generate successive complete classification procedures from which those with the maximal value are selected. These two strategies have been applied to the domain of the diagnosis of Immunoglobulin A Nephropathy disease. The theory optimisation strategy was observed to outperform the rule optimisation strategy.},
Audit-trail = {Reconstructed paper posted},
Keywords = {Rule Learning},
Location = {Kyoto, Japan}
}
ABSTRACT Previous implementations of the Aq algorithm have used rule optimisation search strategies to attempt to develop optimal classification procedures. These strategies involve generating successive characteristic descriptions each of which is individually of maximal value. This is contrasted with theory optimisation search strategies which, instead, generate successive complete classification procedures from which those with the maximal value are selected. These two strategies have been applied to the domain of the diagnosis of Immunoglobulin A Nephropathy disease. The theory optimisation strategy was observed to outperform the rule optimisation strategy.

Webb, G. I., Cumming, G., Richards, T., & Yum, K-K. (1989). The Unification Tutor: An Intelligent Educational System in the Classroom. Proceedings of the Seventh Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE '89), Gold Coast, pp. 408-420.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{WebbCummingRichardsYum89,
Title = {The Unification Tutor: An Intelligent Educational System in the Classroom},
Author = {G. I. Webb and G. Cumming and T. Richards and K-K. Yum},
Booktitle = {Proceedings of the Seventh Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE '89)},
Year = {1989},
Address = {Gold Coast},
Editor = {G. Bishop and J. Baker},
Pages = {408-420},
Publisher = {Bond University},
Abstract = {The Unification Tutor is an experimental Intelligent Tutoring System for the domain of the unification of Prolog terms. It demonstrates the interactive use of Feature-Based Modelling - an approach to cognitive modelling that has been presented at previous ASCILITE Conferences (Webb, 1988b). The Unification Tutor has been used by Third Year Computer Science students at La Trobe University during September 1989. This paper describes the Unification Tutor and evaluates its performance at La Trobe.},
Audit-trail = {Reconstructed paper posted Nov 05},
Keywords = {Feature Based Modeling and Computer Based Learning and Computer Science Education},
Location = {Gold Coast, QLD, Australia},
Related = {feature-based-modeling}
}
ABSTRACT The Unification Tutor is an experimental Intelligent Tutoring System for the domain of the unification of Prolog terms. It demonstrates the interactive use of Feature-Based Modelling - an approach to cognitive modelling that has been presented at previous ASCILITE Conferences (Webb, 1988b). The Unification Tutor has been used by Third Year Computer Science students at La Trobe University during September 1989. This paper describes the Unification Tutor and evaluates its performance at La Trobe.

Webb, G. I. (1989). Courseware Abstraction: Reducing Development Costs While Producing Qualitative Improvements in CAL. Journal of Computer Assisted Learning, 5, 103-113.
[PDF] [Bibtex] [Abstract]

@Article{Webb89b,
Title = {Courseware Abstraction: Reducing Development Costs While Producing Qualitative Improvements in CAL},
Author = {G. I. Webb},
Journal = {Journal of Computer Assisted Learning},
Year = {1989},
Pages = {103-113},
Volume = {5},
Abstract = {Courseware abstraction is an approach to CAL whereby the lesson author creates a general parameterized CAL lesson that is then applied to many concrete examples. This approach has the following advantages over alternative approaches to lesson development: it is cost efficient; it facilitates lesson verification; it encourages the provision of as many examples as are desirable; it simplifies the selection of appropriate examples for presentation to each student; it provides a convenient framework for student evaluation, and it supports the development of factually exhaustive lessons. In short, it provides qualitative improvements, while at the same time reducing lesson development costs. Although widely used, courseware abstraction has not previously been identified as an important CAL technique and its relative merits have never received attention. In particular, there has been a failure to recognize that generative CAL derives most of its power from the use of courseware abstraction.},
Audit-trail = {28/10/03 Not available online. Have permission to post paper but no PDF available. November 2005 - posted reconstructed paper.},
Keywords = {Computer Based Learning},
Publisher = {Blackwell Publishing}
}
ABSTRACT Courseware abstraction is an approach to CAL whereby the lesson author creates a general parameterized CAL lesson that is then applied to many concrete examples. This approach has the following advantages over alternative approaches to lesson development: it is cost efficient; it facilitates lesson verification; it encourages the provision of as many examples as are desirable; it simplifies the selection of appropriate examples for presentation to each student; it provides a convenient framework for student evaluation, and it supports the development of factually exhaustive lessons. In short, it provides qualitative improvements, while at the same time reducing lesson development costs. Although widely used, courseware abstraction has not previously been identified as an important CAL technique and its relative merits have never received attention. In particular, there has been a failure to recognize that generative CAL derives most of its power from the use of courseware abstraction.
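
Courseware abstraction can be pictured with a very small Python sketch: one parameterised lesson template applied to many concrete examples. The template text and example records below are invented for illustration; they are not taken from the paper.

# Minimal sketch of courseware abstraction: a single parameterised lesson
# template instantiated with many concrete examples.
LESSON_TEMPLATE = (
    "The element {element} has atomic number {number}.\n"
    "Question: what is the atomic number of {element}? "
)

EXAMPLES = [
    {"element": "hydrogen", "number": 1},
    {"element": "helium",   "number": 2},
    {"element": "lithium",  "number": 3},
]

def run_lesson(template, examples):
    # Present every concrete instantiation of the one abstract lesson.
    score = 0
    for ex in examples:
        answer = input(template.format(**ex))
        if answer.strip() == str(ex["number"]):
            score += 1
    return score

print(LESSON_TEMPLATE.format(**EXAMPLES[0]))   # one concrete instantiation
# run_lesson(LESSON_TEMPLATE, EXAMPLES)        # interactive use: score the student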

Webb, G. I. (1989). A Machine Learning Approach to Student Modelling. Proceedings of the Third Australian Joint Conference on Artificial Intelligence (AI 89), pp. 195-205.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb89a,
Title = {A Machine Learning Approach to Student Modelling},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Third Australian Joint Conference on Artificial Intelligence (AI 89)},
Year = {1989},
Pages = {195-205},
Abstract = {This paper describes an application of established machine learning principles to student modelling. Unlike previous machine learning based approaches to student modelling, the new approach is based on attribute-value machine learning. In contrast to many previous approaches it is not necessary for the lesson author to identify all forms of error that may be detected. Rather, the lesson author need only identify the relevant attributes both of the tasks to be performed by the student and of the student's actions. The values of these attributes are automatically processed by the student modeler to produce the student model.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and Computer Based Learning},
Location = {Melbourne, Australia},
Related = {feature-based-modeling}
}
ABSTRACT This paper describes an application of established machine learning principles to student modelling. Unlike previous machine learning based approaches to student modelling, the new approach is based on attribute-value machine learning. In contrast to many previous approaches it is not necessary for the lesson author to identify all forms of error that may be detected. Rather, the lesson author need only identify the relevant attributes both of the tasks to be performed by the student and of the student's actions. The values of these attributes are automatically processed by the student modeler to produce the student model.

Richards, T., Webb, G. I., & Craske, N. (1988). Object-oriented Control for Intelligent Computer Assisted Learning Systems. Proceedings of the IFIP TC3 Working Conference on Artificial Intelligence Tools in Education, North-Holland, Amsterdam, pp. 203-219.
[PDF] [Bibtex] [Abstract]

@InProceedings{RichardsWebbCraske88,
Title = {Object-oriented Control for Intelligent Computer Assisted Learning Systems},
Author = {T. Richards and G. I. Webb and N. Craske},
Booktitle = {Proceedings of the IFIP TC3 Working Conference on Artificial Intelligence Tools in Education},
Year = {1988},
Address = {North-Holland, Amsterdam},
Editor = {P. Ercoli and R. Lewis},
Pages = {203-219},
Publisher = {Elsevier},
Abstract = {This paper investigates an approach to providing a general-purpose authoring/tutoring shell for intelligent computer assisted learning systems. The approach is to outline an object-oriented representation of task/goal hierarchies, then to consider the ways in which domain expertise, student information and teacher expertise can be made to interact with such hierarchies. The result is a skeleton in terms of which exploratory lessons are being constructed; and in terms of which further research on the domain expert system, student modelling, educational expertise modelling, and user interfacing can more concretely be developed.},
Audit-trail = {Reconstructed paper posted Nov 05},
Keywords = {Computer Based Learning},
Location = {Frascati, Italy}
}
ABSTRACT This paper investigates an approach to providing a general-purpose authoring/tutoring shell for intelligent computer assisted learning systems. The approach is to outline an object-oriented representation of task/goal hierarchies, then to consider the ways in which domain expertise, student information and teacher expertise can be made to interact with such hierarchies. The result is a skeleton in terms of which exploratory lessons are being constructed; and in terms of which further research on the domain expert system, student modelling, educational expertise modelling, and user interfacing can more concretely be developed.

Webb, G. I. (1988). Techniques for Efficient Empirical Induction. Lecture Notes in Artificial Intelligence Vol. 406: Proceedings of the Second Australian Joint Conference on Artificial Intelligence (AI'88), Berlin, pp. 225-239.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb88b,
Title = {Techniques for Efficient Empirical Induction},
Author = {G. I. Webb},
Booktitle = {Lecture Notes in Artificial Intelligence Vol. 406: Proceedings of the Second Australian Joint Conference on Artificial Intelligence (AI'88)},
Year = {1988},
Address = {Berlin},
Editor = {C. J. Barter and M. J. Brooks},
Pages = {225-239},
Publisher = {Springer-Verlag},
Abstract = {This paper describes the LEI algorithm for empirical induction. The LEI algorithm provides efficient empirical induction for discrete attribute value data. It derives a classification procedure in the form of a set of predicate logic classification rules. This contrasts with the only other efficient approach to exhaustive empirical induction, the derivatives of the CLS algorithm, which present their classification procedures in the form of a decision tree. The LEI algorithm will always find the simplest non-disjunctive rule that correctly classifies all examples of a single class where such a rule exists.},
Audit-trail = {Reconstructed paper posted 6/6/05},
Keywords = {Rule Learning},
Location = {Adelaide, S.A., Australia}
}
ABSTRACT This paper describes the LEI algorithm for empirical induction. The LEI algorithm provides efficient empirical induction for discrete attribute value data. It derives a classification procedure in the form of a set of predicate logic classification rules. This contrasts with the only other efficient approach to exhaustive empirical induction, the derivatives of the CLS algorithm, which present their classification procedures in the form of a decision tree. The LEI algorithm will always find the simplest non-disjunctive rule that correctly classifies all examples of a single class where such a rule exists.
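
The Python sketch below illustrates the stated property (finding the simplest non-disjunctive rule for a class when one exists) by brute force: it enumerates conjunctions of attribute-value tests in order of size and returns the first one that covers every example of the target class and none of the others. It illustrates the property only; it is not the LEI algorithm.

# Illustrative brute-force search, not LEI: return the smallest conjunction of
# attribute-value tests covering all positives and no negatives, if one exists.
from itertools import combinations

def covers(rule, example):
    return all(example[a] == v for a, v in rule)

def simplest_rule(positives, negatives):
    attrs = sorted(positives[0])
    # Candidate tests are those that hold for every positive example.
    tests = [(a, positives[0][a]) for a in attrs
             if all(ex[a] == positives[0][a] for ex in positives)]
    for size in range(1, len(tests) + 1):
        for rule in combinations(tests, size):
            if not any(covers(rule, ex) for ex in negatives):
                return rule
    return None   # only a disjunctive description can separate the classes

pos = [{"shape": "round", "colour": "red",  "size": "big"},
       {"shape": "round", "colour": "red",  "size": "small"}]
neg = [{"shape": "round", "colour": "blue", "size": "big"}]
print(simplest_rule(pos, neg))   # (('colour', 'red'),)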

Webb, G. I. (1988). Cognitive Diagnosis Using Student Attributions. Computers in Learning in Tertiary education: Proceedings of the Sixth Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE-88), pp. 502-514.
[PDF] [Bibtex] [Abstract]  → Related papers and software

@InProceedings{Webb88a,
Title = {Cognitive Diagnosis Using Student Attributions},
Author = {G. I. Webb},
Booktitle = {Computers in Learning in Tertiary education: Proceedings of the Sixth Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE-88)},
Year = {1988},
Editor = {K. Fielden and F. Hicks and N. Scott},
Pages = {502-514},
Abstract = {This paper details an approach to cognitive diagnosis that enables the inference of detailed models of a student's conceptualisation of a domain. This model is constructed by examining the attributes of the problems that the student has tackled and the student's performance while tackling those problems. A feature network is used to represent educationally relevant domain knowledge. This approach has low implementation and operational overheads; It provides a detailed model of the student's conceptualisation of the subject domain in terms of elements of knowledge from that domain; Student models are not restricted to overlays of predefined correct and/or incorrect knowledge; It does not require that the instructional designer anticipate the possible forms of error that may occur; It is robust in the face of partial evaluation of student performance; It is also robust in the face of the instructional designer's failure to incorporate relevant aspects of the subject domain in the knowledge-base; The student models can be executed; It supports accurate diagnosis of multiple viewpoints of the domain even when those viewpoints are not anticipated by the instructional designer; It can support multiple teaching styles in the one lesson.},
Audit-trail = {*},
Keywords = {Feature Based Modeling and Computer Based Learning},
Location = {Canberra, Australia},
Related = {feature-based-modeling}
}
ABSTRACT This paper details an approach to cognitive diagnosis that enables the inference of detailed models of a student's conceptualisation of a domain. This model is constructed by examining the attributes of the problems that the student has tackled and the student's performance while tackling those problems. A feature network is used to represent educationally relevant domain knowledge. This approach has low implementation and operational overheads; It provides a detailed model of the student's conceptualisation of the subject domain in terms of elements of knowledge from that domain; Student models are not restricted to overlays of predefined correct and/or incorrect knowledge; It does not require that the instructional designer anticipate the possible forms of error that may occur; It is robust in the face of partial evaluation of student performance; It is also robust in the face of the instructional designer's failure to incorporate relevant aspects of the subject domain in the knowledge-base; The student models can be executed; It supports accurate diagnosis of multiple viewpoints of the domain even when those viewpoints are not anticipated by the instructional designer; It can support multiple teaching styles in the one lesson.

Webb, G. I. (1988). A Knowledge-Based Approach To Computer-Aided Learning. International Journal of Man-Machine Studies, 29, 257-285.
[PDF] [Bibtex] [Abstract]

@Article{Webb88c,
Title = {A Knowledge-Based Approach To Computer-Aided Learning},
Author = {G. I. Webb},
Journal = {International Journal of Man-Machine Studies},
Year = {1988},
Pages = {257-285},
Volume = {29},
Abstract = {This paper describes a methodology for the creation of knowledge-based computer-aided learning lessons. Unlike previous approaches, the knowledge base is utilized only for restricted aspects of the lesson - both for the management of flow of control through a body of instructional materials and for the evaluation of the student's understanding of the subject matter. This has many advantages. While the approach has lower developmental and operational overheads than alternatives, it is also able to perform far more flexible evaluations of the student's performance. As flow of control is managed by a knowledge-based component with reference to a detailed analysis of the student's understanding of the subject matter, lessons adapt to each student's individual understanding and aptitude within a domain.},
Audit-trail = {Only on-line since 1993 via Science Direct. Aug 2005 pre-pub posted},
Keywords = {Computer Based Learning},
Publisher = {Academic Press}
}
ABSTRACT This paper describes a methodology for the creation of knowledge-based computer-aided learning lessons. Unlike previous approaches, the knowledge base is utilized only for restricted aspects of the lesson - both for the management of flow of control through a body of instructional materials and for the evaluation of the student's understanding of the subject matter. This has many advantages. While the approach has lower developmental and operational overheads than alternatives, it is also able to perform far more flexible evaluations of the student's performance. As flow of control is managed by a knowledge-based component with reference to a detailed analysis of the student's understanding of the subject matter, lessons adapt to each student's individual understanding and aptitude within a domain.

Webb, G. I. (1987). Domain and Tutoring Knowledge in Computer-Aided Learning. Proceedings of the First Australian Joint Conference on Artificial Intelligence (AI'87), Sydney, pp. 488-502.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb87b,
Title = {Domain and Tutoring Knowledge in Computer-Aided Learning},
Author = {G. I. Webb},
Booktitle = {Proceedings of the First Australian Joint Conference on Artificial Intelligence (AI'87)},
Year = {1987},
Address = {Sydney},
Editor = {J. Gero and F. Sudweeks},
Pages = {488-502},
Publisher = {The University of Sydney Printing Service},
Abstract = {Previous approaches to the utilisation of expertise in computer-aided learning have emphasised expertise in the subject domain. By contrast, this paper details an approach that emphasises tutoring expertise and only relies on minimal domain expertise. This has several advantages. The intelligent use of restricted domain expertise enables the detailed evaluation of the students' understanding of the domain. This permits the provision of very flexible tuition that uniquely adjusts to each student's understanding of the domain. Further, due to the restricted nature of the domain knowledge that is required, the developmental overheads associated with a lesson are minimal. Finally, the type of domain knowledge required has a well defined semantics further enabling its intelligent manipulation. The approach described is domain independent. This paper describes the general system architecture, the knowledge representation formalism used and the tutoring strategies that are employed.},
Audit-trail = {Reconstructed paper posted},
Keywords = {Computer Based Learning},
Location = {Sydney, Australia}
}
ABSTRACT Previous approaches to the utilisation of expertise in computer-aided learning have emphasised expertise in the subject domain. By contrast, this paper details an approach that emphasises tutoring expertise and only relies on minimal domain expertise. This has several advantages. The intelligent use of restricted domain expertise enables the detailed evaluation of the students' understanding of the domain. This permits the provision of very flexible tuition that uniquely adjusts to each student's understanding of the domain. Further, due to the restricted nature of the domain knowledge that is required, the developmental overheads associated with a lesson are minimal. Finally, the type of domain knowledge required has a well defined semantics further enabling its intelligent manipulation. The approach described is domain independent. This paper describes the general system architecture, the knowledge representation formalism used and the tutoring strategies that are employed.

Webb, G. I. (1987). Generative CAL and Courseware Abstraction. Using computers intelligently in Tertiary Education: Proceedings of the Fifth Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE-87), pp. 257-285.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb87a,
Title = {Generative CAL and Courseware Abstraction},
Author = {G. I. Webb},
Booktitle = {Using computers intelligently in Tertiary Education: Proceedings of the Fifth Annual Conference of the Australian Society for Computers in Learning in Tertiary Education (ASCILITE-87)},
Year = {1987},
Editor = {J. Barrett and J. Hedberg},
Pages = {257-285},
Abstract = {Courseware abstraction is an approach to CAL whereby the lesson author creates a general parameterised CAL lesson that is then applied to many concrete examples. This approach has the following advantages: 1. it provides a powerful framework within which to adapt tuition to a student's knowledge and aptitude; 2. it encourages the development of detailed treatments of the subject matter; 3. it reduces the cost of lesson development as a ratio to student lesson time; and 4. it enables large numbers of examples to be made available for individual students. Generative CAL is an example of courseware abstraction. It is argued that the advantages of generative CAL do not arise directly from the generation of the examples to be examined but rather can be directly attributed to the use of courseware abstraction.},
Audit-trail = {reconstructed paper posted Sept 06},
Keywords = {Computer Based Learning},
Location = {Sydney, Australia}
}
ABSTRACT Courseware abstraction is an approach to CAL whereby the lesson author creates a general parameterised CAL lesson that is then applied to many concrete examples. This approach has the following advantages: 1. it provides a powerful framework within which to adapt tuition to a student's knowledge and aptitude; 2. it encourages the development of detailed treatments of the subject matter; 3. it reduces the cost of lesson development as a ratio to student lesson time; and 4. it enables large numbers of examples to be made available for individual students. Generative CAL is an example of courseware abstraction. It is argued that the advantages of generative CAL do not arise directly from the generation of the examples to be examined but rather can be directly attributed to the use of courseware abstraction.

Webb, G. I. (1986). Knowledge Based Flow of Control in Computer-Aided Learning. Proceedings of the First Australian Artificial Intelligence Congress (1AAIC'86), pp. B: 1-7.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb86,
Title = {Knowledge Based Flow of Control in Computer-Aided Learning},
Author = {G.I. Webb},
Booktitle = {Proceedings of the First Australian Artificial Intelligence Congress (1AAIC'86)},
Year = {1986},
Pages = {B: 1-7},
Abstract = {In this paper I examine the utilisation of knowledge representation in Computer-Aided Learning (CAL) with the aim of establishing knowledge-based CAL techniques that are best suited to current technology. Most existing knowledge-based CAL systems attempt to generate the entire instructional sequence directly from a domain knowledge base. Such systems suffer from several limitations. These limitations include: 1. It is questionable whether the techniques exist to produce such systems for any but a highly restricted set of domains. 2. Even for those domains in which such systems can be produced the overheads are prohibitive for most purposes. Given these limitations, I argue that knowledge representation should be utilised in CAL only for those aspects of the instructional process for which it results in substantial gains without prohibitive overheads. I demonstrate that one aspect of CAL for which this holds is for managing flow of control within instructional material. I provide a detailed description of feature networks. These are a variant of M.A.K. Halliday's system network formalism. Feature networks are a knowledge representation formalism that efficiently encodes exactly the knowledge that is required for knowledge-based flow of control. It is shown that computer based lessons that utilise feature networks to control flow of control are extremely economic in terms of both authoring time and computer resources while providing highly responsive tuition. DABIS, a system that embodies the methodology outlined above, has been implemented and is described. DABIS demonstrates the feasibility of this methodology. There are no fundamental restrictions to the domains to which it can be applied. A lesson created under the DABIS system has been found to have an authoring to student time ratio of approximately 12:1. Lessons created by the system also demonstrate the sensitivity to a student's knowledge and abilities within a domain that results from the intelligent use of knowledge-based CAL.},
Audit-trail = {Reconstructed paper posted},
Keywords = {Computer Based Learning},
Location = {Melbourne, Australia}
}
ABSTRACT In this paper I examine the utilisation of knowledge representation in Computer-Aided Learning (CAL) with the aim of establishing knowledge-based CAL techniques that are best suited to current technology. Most existing knowledge-based CAL systems attempt to generate the entire instructional sequence directly from a domain knowledge base. Such systems suffer from several limitations. These limitations include: 1. It is questionable whether the techniques exist to produce such systems for any but a highly restricted set of domains. 2. Even for those domains in which such systems can be produced the overheads are prohibitive for most purposes. Given these limitations, I argue that knowledge representation should be utilised in CAL only for those aspects of the instructional process for which it results in substantial gains without prohibitive overheads. I demonstrate that one aspect of CAL for which this holds is for managing flow of control within instructional material. I provide a detailed description of feature networks. These are a variant of M.A.K. Halliday's system network formalism. Feature networks are a knowledge representation formalism that efficiently encodes exactly the knowledge that is required for knowledge-based flow of control. It is shown that computer based lessons that utilise feature networks to control flow of control are extremely economic in terms of both authoring time and computer resources while providing highly responsive tuition. DABIS, a system that embodies the methodology outlined above, has been implemented and is described. DABIS demonstrates the feasibility of this methodology. There are no fundamental restrictions to the domains to which it can be applied. A lesson created under the DABIS system has been found to have an authoring to student time ratio of approximately 12:1. Lessons created by the system also demonstrate the sensitivity to a student's knowledge and abilities within a domain that results from the intelligent use of knowledge-based CAL.
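
A hypothetical Python fragment in the spirit of knowledge-based flow of control as summarised above (it is not the DABIS feature-network formalism): a small network of domain features with prerequisites is consulted to choose the next feature to address, based on which features the student has already demonstrated. The feature names and selection rule are assumptions for the example.

# Hypothetical sketch: pick the next lesson item from a feature network whose
# prerequisites the student has already mastered.
FEATURE_NETWORK = {
    "whole-number-subtraction": [],
    "borrowing":                ["whole-number-subtraction"],
    "borrowing-across-zero":    ["borrowing"],
}

def next_feature(network, mastered):
    # Return a feature whose prerequisites are mastered but which is not yet mastered.
    for feature, prereqs in network.items():
        if feature not in mastered and all(p in mastered for p in prereqs):
            return feature
    return None   # everything mastered

print(next_feature(FEATURE_NETWORK, set()))                          # whole-number-subtraction
print(next_feature(FEATURE_NETWORK, {"whole-number-subtraction"}))   # borrowing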

Webb, G. I. (1986). The Domain-Analysis Based Instruction System. Proceedings of the Fourth Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'86), Adelaide, pp. 295-302.
[PDF] [Bibtex] [Abstract]

@InProceedings{Webb86a,
Title = {The Domain-Analysis Based Instruction System},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Fourth Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'86)},
Year = {1986},
Address = {Adelaide},
Editor = {G. Bishop and W. vanLint},
Pages = {295-302},
Publisher = {University of Adelaide},
Abstract = {At the past two CALITE conferences I have described a methodology for creating knowledge-based CAL. This paper describes how that methodology has evolved. The Domain-Analysis Based Instruction System, a CAL system that utilises the methodology, is then described in detail.},
Audit-trail = {Reconstructed paper posted Sept 06},
Keywords = {Computer Based Learning},
Location = {Adelaide, Australia}
}
ABSTRACT At the past two CALITE conferences I have described a methodology for creating knowledge-based CAL. This paper describes how that methodology has evolved. The Domain-Analysis Based Instruction System, a CAL system that utilises the methodology, is then described in detail.

Richards, T., & Webb, G. I. (1985). ECCLES An Expert System for CAL. Proceedings of the Tenth Western Educational Computing Conference (WECC'85), North Hollywood, CA, pp. 151-157.
[PDF] [Bibtex] [Abstract]

@InProceedings{RichardsWebb85,
Title = {ECCLES An Expert System for CAL},
Author = {T. Richards and G. I. Webb},
Booktitle = {Proceedings of the Tenth Western Educational Computing Conference (WECC'85)},
Year = {1985},
Address = {North Hollywood, CA},
Editor = {H. Garrett},
Pages = {151-157},
Publisher = {Western Periodicals Company},
Abstract = {An authoring and lesson management system is described for Computer Assisted Learning in which lesson questioning and control flow arising from student response are generated at lesson time from an internal model of the lesson topic area. This approach permits the rapid authoring of conceptually complex lesson material. A languageless menu-driven authoring system minimises the system familiarization time for lesson authors and enforces the construction of logically complete lessons. From the student's point of view, the system appears as an expert system in the lesson subject area, with precise and detailed knowledge of the topic taught and of the causes of the student's own errors.},
Audit-trail = {Reconstructed paper posted Sept 06},
Keywords = {Computer Based Learning},
Location = {Oakland, CA}
}
ABSTRACT An authoring and lesson management system is described for Computer Assisted Learning in which lesson questioning and control flow arising from student response are generated at lesson time from an internal model of the lesson topic area. This approach permits the rapid authoring of conceptually complex lesson material. A languageless menu-driven authoring system minimises the system familiarization time for lesson authors and enforces the construction of logically complete lessons. From the student's point of view, the system appears as an expert system in the lesson subject area, with precise and detailed knowledge of the topic taught and of the causes of the student's own errors.

Webb, G. I. (1985). Student Control Under the Feature-Network Based Courseware Design Methodology. Student Control of Learning: Proceedings of the Third Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'85), Melbourne, pp. 27-34.
[Bibtex]

@InProceedings{Webb85a,
Title = {Student Control Under the Feature-Network Based Courseware Design Methodology},
Author = {G. I. Webb},
Booktitle = {Student Control of Learning: Proceedings of the Third Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'85)},
Year = {1985},
Address = {Melbourne},
Editor = {J.A. Bowden and S. Lichtenstein},
Pages = {27-34},
Publisher = {University of Melbourne},
Audit-trail = {*},
Keywords = {Computer Based Learning},
Location = {Melbourne, Australia}
}
ABSTRACT 

Richards, T., Webb, G. I., & Bodnar, S. (1984). ECCLES An Intelligent C.A.L. System. Proceedings of the Second Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'84), Brisbane, pp. 232-235.
[Bibtex]

@InProceedings{RichardsWebbBodnar84,
Title = {ECCLES An Intelligent C.A.L. System},
Author = {T. Richards and G. I. Webb and S. Bodnar},
Booktitle = {Proceedings of the Second Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'84)},
Year = {1984},
Address = {Brisbane},
Editor = {R. Russell},
Pages = {232-235},
Publisher = {University of Queensland},
Audit-trail = {*},
Keywords = {Computer Based Learning},
Location = {Brisbane, Australia}
}
ABSTRACT 

Webb, G. I. (1984). A Methodology for Intermediate Level Knowledge Representation in CAL. Proceedings of the Second Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'84), Brisbane, pp. 288-303.
[Bibtex]

@InProceedings{Webb84,
Title = {A Methodology for Intermediate Level Knowledge Representation in CAL},
Author = {G. I. Webb},
Booktitle = {Proceedings of the Second Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'84)},
Year = {1984},
Address = {Brisbane},
Editor = {R. Russell},
Pages = {288-303},
Publisher = {University of Queensland},
Audit-trail = {*},
Keywords = {Computer Based Learning},
Location = {Brisbane, Australia}
}
ABSTRACT 

Richards, T., Hooke, R., & Webb, G. I. (1983). Automatic Authoring of Complex and Analytical Question-Answer Lessons. Proceedings of the First Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'83), Brisbane, pp. 282-293.
[Bibtex]

@InProceedings{RichardsHookeWebb83,
Title = {Automatic Authoring of Complex and Analytical Question-Answer Lessons},
Author = {T. Richards and R. Hooke and G. I. Webb},
Booktitle = {Proceedings of the First Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'83)},
Year = {1983},
Address = {Brisbane},
Editor = {R. Russell},
Pages = {282-293},
Publisher = {University of Queensland},
Keywords = {Computer Based Learning},
Location = {Brisbane, Australia}
}
ABSTRACT 

Webb, G. I. (1983). A Computer Based Language Instructional Expert. Proceedings of the First Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'83), Brisbane, pp. 391-402.
[Bibtex]

@InProceedings{Webb83,
Title = {A Computer Based Language Instructional Expert},
Author = {G. I. Webb},
Booktitle = {Proceedings of the First Annual Computer-Assisted Learning in Tertiary Education Conference (CALITE'83)},
Year = {1983},
Address = {Brisbane},
Editor = {R. Russell},
Pages = {391-402},
Publisher = {University of Queensland},
Audit-trail = {*},
Keywords = {Computer Based Learning},
Location = {Brisbane, Australia}
}
ABSTRACT