Skip to content

Publications with awards

Return to full publication list

exclamation Clarivate Web of Science Highly Cited Paper 2020
DeepCleave: a deep learning predictor for caspase and matrix metalloprotease substrates and cleavage sites.
Li, F., Chen, J., Leier, A., Marquez-Lago, T., Liu, Q., Wang, Y., Revote, J., Smith, I. A., Akutsu, T., Webb, G. I., Kurgan, L., & Song, J.
Bioinformatics, 36(4), 1057-1065, 2020.
[DOI] [Bibtex] [Abstract]

@Article{Li2020a,
author = {Li, Fuyi and Chen, Jinxiang and Leier, Andre and Marquez-Lago, Tatiana and Liu, Quanzhong and Wang, Yanze and Revote, Jerico and Smith, A Ian and Akutsu, Tatsuya and Webb, Geoffrey I and Kurgan, Lukasz and Song, Jiangning},
journal = {Bioinformatics},
title = {{DeepCleave}: a deep learning predictor for caspase and matrix metalloprotease substrates and cleavage sites},
year = {2020},
issn = {1367-4803},
number = {4},
pages = {1057--1065},
volume = {36},
abstract = {Proteases are enzymes that cleave target substrate proteins by catalyzing the hydrolysis of peptide bonds between specific amino acids. While the functional proteolysis regulated by proteases plays a central role in the ``life and death'' process of proteins, many of the corresponding substrates and their cleavage sites were not found yet. Availability of accurate predictors of the substrates and cleavage sites would facilitate understanding of proteases' functions and physiological roles. Deep learning is a promising approach for the development of accurate predictors of substrate cleavage events. We propose DeepCleave, the first deep learning-based predictor of protease-specific substrates and cleavage sites. DeepCleave uses protein substrate sequence data as input and employs convolutional neural networks with transfer learning to train accurate predictive models. High predictive performance of our models stems from the use of high-quality cleavage site features extracted from the substrate sequences through the deep learning process, and the application of transfer learning, multiple kernels and attention layer in the design of the deep network. Empirical tests against several related state-of-the-art methods demonstrate that DeepCleave outperforms these methods in predicting caspase and matrix metalloprotease substrate-cleavage sites. The DeepCleave webserver and source code are freely available at http://deepcleave.erc.monash.edu/. Supplementary data are available at Bioinformatics online.},
comment = {Clarivate Web of Science Highly Cited Paper 2020},
doi = {10.1093/bioinformatics/btz721},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT {Proteases are enzymes that cleave target substrate proteins by catalyzing the hydrolysis of peptide bonds between specific amino acids. While the functional proteolysis regulated by proteases plays a central role in the "life and death" process of proteins, many of the corresponding substrates and their cleavage sites were not found yet. Availability of accurate predictors of the substrates and cleavage sites would facilitate understanding of proteases’ functions and physiological roles. Deep learning is a promising approach for the development of accurate predictors of substrate cleavage events.We propose DeepCleave, the first deep learning-based predictor of protease-specific substrates and cleavage sites. DeepCleave uses protein substrate sequence data as input and employs convolutional neural networks with transfer learning to train accurate predictive models. High predictive performance of our models stems from the use of high-quality cleavage site features extracted from the substrate sequences through the deep learning process, and the application of transfer learning, multiple kernels and attention layer in the design of the deep network. Empirical tests against several related state-of-the-art methods demonstrate that DeepCleave outperforms these methods in predicting caspase and matrix metalloprotease substrate-cleavage sites.The DeepCleave webserver and source code are freely available at http://deepcleave.erc.monash.edu/.Supplementary data are available at Bioinformatics online.}

exclamation Clarivate Web of Science Highly Cited Paper 2020, 2021
iLearn: an integrated platform and meta-learner for feature engineering, machine-learning analysis and modeling of DNA, RNA and protein sequence data.
Chen, Z., Zhao, P., Li, F., Marquez-Lago, T. T., Leier, A., Revote, J., Zhu, Y., Powell, D. R., Akutsu, T., Webb, G. I., Chou, K., Smith, I. A., Daly, R. J., Li, J., & Song, J.
Briefings in Bioinformatics, 21(3), 1047-1057, 2020.
[DOI] [Bibtex] [Abstract]

@Article{10.1093/bib/bbz041,
author = {Chen, Zhen and Zhao, Pei and Li, Fuyi and Marquez-Lago, Tatiana T and Leier, Andre and Revote, Jerico and Zhu, Yan and Powell, David R and Akutsu, Tatsuya and Webb, Geoffrey I and Chou, Kuo-Chen and Smith, A Ian and Daly, Roger J and Li, Jian and Song, Jiangning},
journal = {Briefings in Bioinformatics},
title = {{iLearn}: an integrated platform and meta-learner for feature engineering, machine-learning analysis and modeling of {DNA}, {RNA} and protein sequence data},
year = {2020},
issn = {1477-4054},
number = {3},
pages = {1047--1057},
volume = {21},
abstract = {With the explosive growth of biological sequences generated in the post-genomic era, one of the most challenging problems in bioinformatics and computational biology is to computationally characterize sequences, structures and functions in an efficient, accurate and high-throughput manner. A number of online web servers and stand-alone tools have been developed to address this to date; however, all these tools have their limitations and drawbacks in terms of their effectiveness, user-friendliness and capacity. Here, we present iLearn, a comprehensive and versatile Python-based toolkit, integrating the functionality of feature extraction, clustering, normalization, selection, dimensionality reduction, predictor construction, best descriptor/model selection, ensemble learning and results visualization for DNA, RNA and protein sequences. iLearn was designed for users that only want to upload their data set and select the functions they need calculated from it, while all necessary procedures and optimal settings are completed automatically by the software. iLearn includes a variety of descriptors for DNA, RNA and proteins, and four feature output formats are supported so as to facilitate direct output usage or communication with other computational tools. In total, iLearn encompasses 16 different types of feature clustering, selection, normalization and dimensionality reduction algorithms, and five commonly used machine-learning algorithms, thereby greatly facilitating feature analysis and predictor construction. iLearn is made freely available via an online web server and a stand-alone toolkit.},
comment = {Clarivate Web of Science Highly Cited Paper 2020, 2021},
doi = {10.1093/bib/bbz041},
keywords = {Bioinformatics and DP140100087},
related = {computational-biology},
}
ABSTRACT With the explosive growth of biological sequences generated in the post-genomic era, one of the most challenging problems in bioinformatics and computational biology is to computationally characterize sequences, structures and functions in an efficient, accurate and high-throughput manner. A number of online web servers and stand-alone tools have been developed to address this to date; however, all these tools have their limitations and drawbacks in terms of their effectiveness, user-friendliness and capacity. Here, we present iLearn, a comprehensive and versatile Python-based toolkit, integrating the functionality of feature extraction, clustering, normalization, selection, dimensionality reduction, predictor construction, best descriptor/model selection, ensemble learning and results visualization for DNA, RNA and protein sequences. iLearn was designed for users that only want to upload their data set and select the functions they need calculated from it, while all necessary procedures and optimal settings are completed automatically by the software. iLearn includes a variety of descriptors for DNA, RNA and proteins, and four feature output formats are supported so as to facilitate direct output usage or communication with other computational tools. In total, iLearn encompasses 16 different types of feature clustering, selection, normalization and dimensionality reduction algorithms, and five commonly used machine-learning algorithms, thereby greatly facilitating feature analysis and predictor construction. iLearn is made freely available via an online web server and a stand-alone toolkit.

exclamation Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020, 2021
iProt-Sub: a comprehensive package for accurately mapping and predicting protease-specific substrates and cleavage sites.
Song, J., Wang, Y., Li, F., Akutsu, T., Rawlings, N. D., Webb, G. I., & Chou, K.
Briefings in Bioinformatics, 20(2), 638-658, 2019.
[DOI] [Bibtex] [Abstract]

@Article{doi:10.1093/bib/bby028,
author = {Song, Jiangning and Wang, Yanan and Li, Fuyi and Akutsu, Tatsuya and Rawlings, Neil D and Webb, Geoffrey I and Chou, Kuo-Chen},
journal = {Briefings in Bioinformatics},
title = {{iProt-Sub}: a comprehensive package for accurately mapping and predicting protease-specific substrates and cleavage sites},
year = {2019},
number = {2},
pages = {638--658},
volume = {20},
abstract = {Regulation of proteolysis plays a critical role in a myriad of important cellular processes. The key to better understanding the mechanisms that control this process is to identify the specific substrates that each protease targets. To address this, we have developed iProt-Sub, a powerful bioinformatics tool for the accurate prediction of protease-specific substrates and their cleavage sites. Importantly, iProt-Sub represents a significantly advanced version of its successful predecessor, PROSPER. It provides optimized cleavage site prediction models with better prediction performance and coverage for more species-specific proteases (4 major protease families and 38 different proteases). iProt-Sub integrates heterogeneous sequence and structural features and uses a two-step feature selection procedure to further remove redundant and irrelevant features in an effort to improve the cleavage site prediction accuracy. Features used by iProt-Sub are encoded by 11 different sequence encoding schemes, including local amino acid sequence profile, secondary structure, solvent accessibility and native disorder, which will allow a more accurate representation of the protease specificity of approximately 38 proteases and training of the prediction models. Benchmarking experiments using cross-validation and independent tests showed that iProt-Sub is able to achieve a better performance than several existing generic tools. We anticipate that iProt-Sub will be a powerful tool for proteome-wide prediction of protease-specific substrates and their cleavage sites, and will facilitate hypothesis-driven functional interrogation of protease-specific substrate cleavage and proteolytic events.},
comment = {Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020, 2021},
doi = {10.1093/bib/bby028},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Regulation of proteolysis plays a critical role in a myriad of important cellular processes. The key to better understanding the mechanisms that control this process is to identify the specific substrates that each protease targets. To address this, we have developed iProt-Sub, a powerful bioinformatics tool for the accurate prediction of protease-specific substrates and their cleavage sites. Importantly, iProt-Sub represents a significantly advanced version of its successful predecessor, PROSPER. It provides optimized cleavage site prediction models with better prediction performance and coverage for more species-specific proteases (4 major protease families and 38 different proteases). iProt-Sub integrates heterogeneous sequence and structural features and uses a two-step feature selection procedure to further remove redundant and irrelevant features in an effort to improve the cleavage site prediction accuracy. Features used by iProt-Sub are encoded by 11 different sequence encoding schemes, including local amino acid sequence profile, secondary structure, solvent accessibility and native disorder, which will allow a more accurate representation of the protease specificity of approximately 38 proteases and training of the prediction models. Benchmarking experiments using cross-validation and independent tests showed that iProt-Sub is able to achieve a better performance than several existing generic tools. We anticipate that iProt-Sub will be a powerful tool for proteome-wide prediction of protease-specific substrates and their cleavage sites, and will facilitate hypothesis-driven functional interrogation of protease-specific substrate cleavage and proteolytic events.

exclamation Clarivate Web of Science Highly Cited Paper 2021
Temporal Convolutional Neural Network for the Classification of Satellite Image Time Series.
Pelletier, C., Webb, G. I., & Petitjean, F.
Remote Sensing, 11(5), Art. no. 523, 2019.
[DOI] [Bibtex] [Abstract]

@Article{PelletierEtAl19,
author = {Pelletier, Charlotte and Webb, Geoffrey I. and Petitjean, Francois},
journal = {Remote Sensing},
title = {Temporal Convolutional Neural Network for the Classification of {Satellite Image Time Series}},
year = {2019},
issn = {2072-4292},
number = {5},
volume = {11},
abstract = {Latest remote sensing sensors are capable of acquiring high spatial and spectral Satellite Image Time Series (SITS) of the world. These image series are a key component of classification systems that aim at obtaining up-to-date and accurate land cover maps of the Earth's surfaces. More specifically, current SITS combine high temporal, spectral and spatial resolutions, which makes it possible to closely monitor vegetation dynamics. Although traditional classification algorithms, such as Random Forest (RF), have been successfully applied to create land cover maps from SITS, these algorithms do not make the most of the temporal domain. This paper proposes a comprehensive study of Temporal Convolutional Neural Networks (TempCNNs), a deep learning approach which applies convolutions in the temporal dimension in order to automatically learn temporal (and spectral) features. The goal of this paper is to quantitatively and qualitatively evaluate the contribution of TempCNNs for SITS classification, as compared to RF and Recurrent Neural Networks (RNNs)---a standard deep learning approach that is particularly suited to temporal data. We carry out experiments on Formosat-2 scene with 46 images and one million labelled time series. The experimental results show that TempCNNs are more accurate than the current state of the art for SITS classification. We provide some general guidelines on the network architecture, common regularization mechanisms, and hyper-parameter values such as batch size; we also draw out some differences with standard results in computer vision (e.g., about pooling layers). Finally, we assess the visual quality of the land cover maps produced by TempCNNs.},
articlenumber = {523},
comment = {Clarivate Web of Science Highly Cited Paper 2021},
doi = {10.3390/rs11050523},
keywords = {time series, earth observation analytics},
related = {earth-observation-analytics},
}
ABSTRACT Latest remote sensing sensors are capable of acquiring high spatial and spectral Satellite Image Time Series (SITS) of the world. These image series are a key component of classification systems that aim at obtaining up-to-date and accurate land cover maps of the Earth’s surfaces. More specifically, current SITS combine high temporal, spectral and spatial resolutions, which makes it possible to closely monitor vegetation dynamics. Although traditional classification algorithms, such as Random Forest (RF), have been successfully applied to create land cover maps from SITS, these algorithms do not make the most of the temporal domain. This paper proposes a comprehensive study of Temporal Convolutional Neural Networks (TempCNNs), a deep learning approach which applies convolutions in the temporal dimension in order to automatically learn temporal (and spectral) features. The goal of this paper is to quantitatively and qualitatively evaluate the contribution of TempCNNs for SITS classification, as compared to RF and Recurrent Neural Networks (RNNs) —a standard deep learning approach that is particularly suited to temporal data. We carry out experiments on Formosat-2 scene with 46 images and one million labelled time series. The experimental results show that TempCNNs are more accurate than the current state of the art for SITS classification. We provide some general guidelines on the network architecture, common regularization mechanisms, and hyper-parameter values such as batch size; we also draw out some differences with standard results in computer vision (e.g., about pooling layers). Finally, we assess the visual quality of the land cover maps produced by TempCNNs.

exclamation Best Research Paper Award
Efficient search of the best warping window for Dynamic Time Warping.
Tan, C. W., Herrmann, M., Forestier, G., Webb, G. I., & Petitjean, F.
Proceedings of the 2018 SIAM International Conference on Data Mining, pp. 459-467, 2018.
[PDF] [Bibtex] [Abstract]

@InProceedings{TanEtAl18,
author = {Tan, Chang Wei and Herrmann, Matthieu and Forestier, Germain and Webb, Geoffrey I. and Petitjean, Francois},
booktitle = {Proceedings of the 2018 {SIAM} International Conference on Data Mining},
title = {Efficient search of the best warping window for {Dynamic Time Warping}},
year = {2018},
pages = {459--467},
abstract = {Time series classification maps time series to labels. The nearest neighbour algorithm (NN) using the Dynamic Time Warping (DTW) similarity measure is a leading algorithm for this task and a component of the current best ensemble classifiers for time series. However, NN-DTW is only a winning combination when its meta-parameter---its warping window---is learned from the training data. The warping window (WW) intuitively controls the amount of distortion allowed when comparing a pair of time series. With a training database of N time series of lengths L, a naive approach to learning the WW requires {$\Omega(N^2 L^3)$} operations. This often translates in NN-DTW requiring days for training on datasets containing a few thousand time series only. In this paper, we introduce FastWWSearch: an efficient and exact method to learn WW. We show on 86 datasets that our method is always faster than the state of the art, with at least one order of magnitude and up to 1000x speed-up.},
comment = {Best Research Paper Award},
keywords = {time series},
related = {scalable-time-series-classifiers},
}
ABSTRACT Time series classification maps time series to labels. The nearest neighbour algorithm (NN) using the Dynamic Time Warping (DTW) similarity measure is a leading algorithm for this task and a component of the current best ensemble classifiers for time series. However, NN-DTW is only a winning combination when its meta-parameter — its warping window — is learned from the training data. The warping window (WW) intuitively controls the amount of distortion allowed when comparing a pair of time series. With a training database of N time series of lengths L, a naive approach to learning the WW requires Omega(N^2 L^3) operations. This often translates in NN-DTW requiring days for training on datasets containing a few thousand time series only. In this paper, we introduce FastWWSearch: an efficient and exact method to learn WW. We show on 86 datasets that our method is always faster than the state of the art, with at least one order of magnitude and up to 1000x speed-up.

exclamation Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020
PREvaIL, an integrative approach for inferring catalytic residues using sequence, structural, and network features in a machine-learning framework.
Song, J., Li, F., Takemoto, K., Haffari, G., Akutsu, T., Chou, K. C., & Webb, G. I.
Journal of Theoretical Biology, 443, 125-137, 2018.
[DOI] [Bibtex]

@Article{SongEtAl18,
author = {Song, J. and Li, F. and Takemoto, K. and Haffari, G. and Akutsu, T. and Chou, K. C. and Webb, G. I.},
journal = {Journal of Theoretical Biology},
title = {{PREvaIL}, an integrative approach for inferring catalytic residues using sequence, structural, and network features in a machine-learning framework},
year = {2018},
pages = {125--137},
volume = {443},
comment = {Clarivate Web of Science Hot Paper and Highly Cited Paper 2019, 2020},
doi = {10.1016/j.jtbi.2018.01.023},
keywords = {Bioinformatics},
related = {computational-biology},
url = {https://authors.elsevier.com/c/1WWQY57ilzyRc},
}
ABSTRACT 

exclamation Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021
iFeature: a python package and web server for features extraction and selection from protein and peptide sequences.
Chen, Z., Zhao, P., Li, F., Leier, A., Marquez-Lago, T. T., Wang, Y., Webb, G. I., Smith, I. A., Daly, R. J., Chou, K., & Song, J.
Bioinformatics, 2499-2502, 2018.
[DOI] [Bibtex] [Abstract]

@Article{ChenEtAl18,
author = {Chen, Zhen and Zhao, Pei and Li, Fuyi and Leier, Andre and Marquez-Lago, Tatiana T and Wang, Yanan and Webb, Geoffrey I and Smith, A Ian and Daly, Roger J and Chou, Kuo-Chen and Song, Jiangning},
journal = {Bioinformatics},
title = {{iFeature}: a {Python} package and web server for features extraction and selection from protein and peptide sequences},
year = {2018},
pages = {2499--2502},
abstract = {Structural and physiochemical descriptors extracted from sequence data have been widely used to represent sequences and predict structural, functional, expression and interaction profiles of proteins and peptides as well as DNAs/RNAs. Here, we present iFeature, a versatile Python-based toolkit for generating various numerical feature representation schemes for both protein and peptide sequences. iFeature is capable of calculating and extracting a comprehensive spectrum of 18 major sequence encoding schemes that encompass 53 different types of feature descriptors. It also allows users to extract specific amino acid properties from the AAindex database. Furthermore, iFeature integrates 12 different types of commonly used feature clustering, selection and dimensionality reduction algorithms, greatly facilitating training, analysis and benchmarking of machine-learning models. The functionality of iFeature is made freely available via an online web server and a stand-alone toolkit.},
comment = {Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021},
doi = {10.1093/bioinformatics/bty140},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT Structural and physiochemical descriptors extracted from sequence data have been widely used to represent sequences and predict structural, functional, expression and interaction profiles of proteins and peptides as well as DNAs/RNAs. Here, we present iFeature, a versatile Python-based toolkit for generating various numerical feature representation schemes for both protein and peptide sequences. iFeature is capable of calculating and extracting a comprehensive spectrum of 18 major sequence encoding schemes that encompass 53 different types of feature descriptors. It also allows users to extract specific amino acid properties from the AAindex database. Furthermore, iFeature integrates 12 different types of commonly used feature clustering, selection and dimensionality reduction algorithms, greatly facilitating training, analysis and benchmarking of machine-learning models. The functionality of iFeature is made freely available via an online web server and a stand-alone toolkit.

exclamation Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021
PROSPERous: high-throughput prediction of substrate cleavage sites for 90 proteases with improved accuracy.
Song, J., Li, F., Leier, A., Marquez-Lago, T. T., Akutsu, T., Haffari, G., Chou, K., Webb, G. I., & Pike, R. N.
Bioinformatics, 34(4), 684-687, 2017.
[DOI] [Bibtex]

@Article{Song2017a,
author = {Song, Jiangning and Li, Fuyi and Leier, Andre and Marquez-Lago, Tatiana T and Akutsu, Tatsuya and Haffari, Gholamreza and Chou, Kuo-Chen and Webb, Geoffrey I and Pike, Robert N},
journal = {Bioinformatics},
title = {{PROSPERous}: high-throughput prediction of substrate cleavage sites for 90 proteases with improved accuracy},
year = {2017},
number = {4},
pages = {684--687},
volume = {34},
comment = {Clarivate Web of Science Highly Cited Paper 2019, 2020, 2021},
doi = {10.1093/bioinformatics/btx670},
keywords = {Bioinformatics},
related = {computational-biology},
}
ABSTRACT 

exclamation Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue
A multiple test correction for streams and cascades of statistical hypothesis tests.
Webb, G. I., & Petitjean, F.
Proceedings of the ACM SIGKDD Conference on Knowledge Discovery and Data Mining, KDD-16, pp. 1255-1264, 2016.
[PDF] [DOI] [Bibtex] [Abstract]

@InProceedings{WebbPetitjean16,
author = {Webb, Geoffrey I. and Petitjean, Francois},
booktitle = {Proceedings of the {ACM} {SIGKDD} Conference on Knowledge Discovery and Data Mining, {KDD}-16},
title = {A multiple test correction for streams and cascades of statistical hypothesis tests},
year = {2016},
pages = {1255--1264},
publisher = {ACM Press},
abstract = {Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e.~rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance.
This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed.
To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models.
We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.},
comment = {Top reviewer score (4.75/5.0), shortlisted for best paper award and invited to ACM TKDE journal KDD-16 special issue},
doi = {10.1145/2939672.2939775},
keywords = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
related = {statistically-sound-association-discovery},
url = {http://dl.acm.org/authorize?N19100},
}
ABSTRACT Statistical hypothesis testing is a popular and powerful tool for inferring knowledge from data. For every such test performed, there is always a non-zero probability of making a false discovery, i.e.~rejecting a null hypothesis in error. Familywise error rate (FWER) is the probability of making at least one false discovery during an inference process. The expected FWER grows exponentially with the number of hypothesis tests that are performed, almost guaranteeing that an error will be committed if the number of tests is big enough and the risk is not managed; a problem known as the multiple testing problem. State-of-the-art methods for controlling FWER in multiple comparison settings require that the set of hypotheses be pre-determined. This greatly hinders statistical testing for many modern applications of statistical inference, such as model selection, because neither the set of hypotheses that will be tested, nor even the number of hypotheses, can be known in advance. This paper introduces Subfamilywise Multiple Testing, a multiple-testing correction that can be used in applications for which there are repeated pools of null hypotheses from each of which a single null hypothesis is to be rejected and neither the specific hypotheses nor their number are known until the final rejection decision is completed. To demonstrate the importance and relevance of this work to current machine learning problems, we further refine the theory to the problem of model selection and show how to use Subfamilywise Multiple Testing for learning graphical models. We assess its ability to discover graphical models on more than 7,000 datasets, studying the ability of Subfamilywise Multiple Testing to outperform the state of the art on data with varying size and dimensionality, as well as with varying density and power of the present correlations. 
Subfamilywise Multiple Testing provides a significant improvement in statistical efficiency, often requiring only half as much data to discover the same model, while strictly controlling FWER.

exclamation Best Research Paper Honorable Mention Award
Scaling log-linear analysis to datasets with thousands of variables.
Petitjean, F., & Webb, G. I.
Proceedings of the 2015 SIAM International Conference on Data Mining, pp. 469-477, 2015.
[URL] [Bibtex] [Abstract]

@InProceedings{PetitjeanWebb15,
  author    = {Petitjean, F. and Webb, G. I.},
  booktitle = {Proceedings of the 2015 {SIAM} International Conference on Data Mining},
  title     = {Scaling log-linear analysis to datasets with thousands of variables},
  year      = {2015},
  pages     = {469--477},
  abstract  = {Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.},
  comment   = {Best Research Paper Honorable Mention Award},
  keywords  = {Association Rule Discovery and statistically sound discovery and scalable graphical models and Learning from large datasets and DP140100087},
  related   = {scalable-graphical-modeling},
  doi       = {10.1137/1.9781611974010.53},
  url       = {http://epubs.siam.org/doi/pdf/10.1137/1.9781611974010.53},
}
ABSTRACT Association discovery is a fundamental data mining task. The primary statistical approach to association discovery between variables is log-linear analysis. Classical approaches to log-linear analysis do not scale beyond about ten variables. We have recently shown that, if we ensure that the graph supporting the log-linear model is chordal, log-linear analysis can be applied to datasets with hundreds of variables without sacrificing the statistical soundness [21]. However, further scalability remained limited, because state-of-the-art techniques have to examine every edge at every step of the search. This paper makes the following contributions: 1) we prove that only a very small subset of edges has to be considered at each step of the search; 2) we demonstrate how to efficiently find this subset of edges and 3) we show how to efficiently keep track of the best edges to be subsequently added to the initial model. Our experiments, carried out on real datasets with up to 2000 variables, show that our contributions make it possible to gain about 4 orders of magnitude, making log-linear analysis of datasets with thousands of variables possible in seconds instead of days.

exclamation One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
Dynamic Time Warping Averaging of Time Series Allows Faster and More Accurate Classification.
Petitjean, F., Forestier, G., Webb, G. I., Nicholson, A., Chen, Y., & Keogh, E.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 470-479, 2014.
[PDF] [URL] [Bibtex] [Abstract]

@InProceedings{PetitjeanEtAl14b,
  author    = {Petitjean, F. and Forestier, G. and Webb, G. I. and Nicholson, A. and Chen, Y. and Keogh, E.},
  booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
  title     = {{Dynamic Time Warping} Averaging of Time Series Allows Faster and More Accurate Classification},
  year      = {2014},
  pages     = {470--479},
  abstract  = {Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' times series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.},
  comment   = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
  keywords  = {time series},
  related   = {scalable-time-series-classifiers},
  doi       = {10.1109/ICDM.2014.27},
  url       = {http://dx.doi.org/10.1109/ICDM.2014.27},
}
ABSTRACT Recent years have seen significant progress in improving both the efficiency and effectiveness of time series classification. However, because the best solution is typically the Nearest Neighbor algorithm with the relatively expensive Dynamic Time Warping as the distance measure, successful deployments on resource constrained devices remain elusive. Moreover, the recent explosion of interest in wearable devices, which typically have limited computational resources, has created a growing need for very efficient classification algorithms. A commonly used technique to glean the benefits of the Nearest Neighbor algorithm, without inheriting its undesirable time complexity, is to use the Nearest Centroid algorithm. However, because of the unique properties of (most) time series data, the centroid typically does not resemble any of the instances, an unintuitive and underappreciated fact. In this work we show that we can exploit a recent result to allow meaningful averaging of 'warped' times series, and that this result allows us to create ultra-efficient Nearest 'Centroid' classifiers that are at least as accurate as their more lethargic Nearest Neighbor cousins.

exclamation One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue
A Statistically Efficient and Scalable Method for Log-Linear Analysis of High-Dimensional Data.
Petitjean, F., Allison, L., & Webb, G. I.
Proceedings of the 14th IEEE International Conference on Data Mining, pp. 480-489, 2014.
[PDF] [URL] [Bibtex] [Abstract]

@InProceedings{PetitjeanEtAl14a,
  author    = {Petitjean, F. and Allison, L. and Webb, G. I.},
  booktitle = {Proceedings of the 14th {IEEE} International Conference on Data Mining},
  title     = {A Statistically Efficient and Scalable Method for Log-Linear Analysis of High-Dimensional Data},
  year      = {2014},
  pages     = {480--489},
  abstract  = {Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of {$\chi^2$} tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches -- statistical efficiency -- 2) controls for the risk of false discoveries as well as statistical approaches -- high precision -- and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer -- computational efficiency.},
  comment   = {One of nine papers invited to Knowledge and Information Systems journal ICDM-14 special issue},
  keywords  = {Association Rule Discovery and statistically sound discovery and scalable graphical models and DP140100087},
  related   = {scalable-graphical-modeling},
  doi       = {10.1109/ICDM.2014.23},
  url       = {http://dx.doi.org/10.1109/ICDM.2014.23},
}
ABSTRACT Log-linear analysis is the primary statistical approach to discovering conditional dependencies between the variables of a dataset. A good log-linear analysis method requires both high precision and statistical efficiency. High precision means that the risk of false discoveries should be kept very low. Statistical efficiency means that the method should discover actual associations with as few samples as possible. Classical approaches to log-linear analysis make use of χ2 tests to control this balance between quality and complexity. We present an information-theoretic approach to log-linear analysis. We show that our approach 1) requires significantly fewer samples to discover the true associations than statistical approaches – statistical efficiency – 2) controls for the risk of false discoveries as well as statistical approaches – high precision – and 3) can perform the discovery on datasets with hundreds of variables on a standard desktop computer – computational efficiency.