@inproceedings{ICBO_2018_18,
  title        = {Taking a Dive: Experiments in Deep Learning for Automatic Ontology-based Annotation of Scientific Literature},
  booktitle    = {International Conference on Biomedical Ontology ({ICBO} 2018)},
  series       = {Proceedings of the International Conference on Biological Ontology (2018)},
  year         = {2018},
  month        = aug,
  publisher    = {International Conference on Biological Ontology},
  organization = {International Conference on Biological Ontology},
  abstract     = {Text mining approaches for automated ontology-based curation of biological and biomedical literature have largely focused on syntactic and lexical analysis along with machine learning. Recent advances in deep learning have shown increased accuracy for textual data annotation. However, the application of deep learning for ontology-based curation is a relatively new area and prior work has focused on a limited set of models. Here, we introduce a new deep learning model/architecture based on combining multiple Gated Recurrent Units (GRU) with a character+word based input. We use data from five ontologies in the CRAFT corpus as a Gold Standard to evaluate our model{\textquoteright}s performance. We also compare our model to seven models from prior work. We use four metrics - Precision, Recall, F1 score, and a semantic similarity metric (Jaccard similarity) to compare our model{\textquoteright}s output to the Gold Standard. Our model resulted in 84\% Precision, 84\% Recall, 83\% F1, and 84\% Jaccard similarity. Results show that our GRU-based model outperforms prior models across all five ontologies. We also observed that character+word inputs result in a higher performance across models as compared to word only inputs. These findings indicate that deep learning algorithms are a promising avenue to be explored for automated ontology-based curation of data. This study also serves as a formal comparison and guideline for building and selecting deep learning models and architectures for ontology-based curation.},
  keywords     = {automated curation, deep learning, named entity recognition, natural language processing, ontology},
  url          = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_18.pdf},
  author       = {Manda, Prashanti and Beasley, Lucas and Mohanty, Somya},
}

@inproceedings{ICBO_2018_4,
  title        = {Comparison of Natural Language Processing Tools for Automatic {Gene Ontology} Annotation of Scientific Literature},
  booktitle    = {International Conference on Biomedical Ontology ({ICBO} 2018)},
  series       = {Proceedings of the International Conference on Biological Ontology (2018)},
  year         = {2018},
  month        = aug,
  publisher    = {International Conference on Biological Ontology},
  organization = {International Conference on Biological Ontology},
  abstract     = {Manual curation of scientific literature for ontology-based knowledge representation has proven infeasible and unscalable to the large and growing volume of scientific literature. Automated annotation solutions that leverage text mining and Natural Language Processing (NLP) have been developed to ameliorate the problem of literature curation. These NLP approaches use parsing, syntactical, and lexical analysis of text to recognize and annotate pieces of text with ontology concepts. Here, we conduct a comparison of four state of the art NLP tools at the task of recognizing Gene Ontology concepts from biomedical literature using the Colorado Richly Annotated Full-Text (CRAFT) corpus as a gold standard reference. We demonstrate the use of semantic similarity metrics to compare NLP tool annotations to the gold standard.},
  keywords     = {curation, gene ontology, natural language processing, semantic similarity, text mining},
  url          = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_4.pdf},
  author       = {Beasley, Lucas and Manda, Prashanti},
}

@inproceedings{ICBO_2018_47,
  title        = {On the Statistical Sensitivity of Semantic Similarity Metrics},
  booktitle    = {International Conference on Biomedical Ontology ({ICBO} 2018)},
  series       = {Proceedings of the International Conference on Biological Ontology (2018)},
  year         = {2018},
  month        = aug,
  publisher    = {International Conference on Biological Ontology},
  organization = {International Conference on Biological Ontology},
  abstract     = {Measuring the semantic similarity between objects that have been annotated with ontological terms is fundamental to an increasing number of biomedical applications, and several different ontologically-aware semantic similarity metrics are in common use. In some of these applications, only weak semantic similarity is expected for biologically meaningful matches. In such cases, it is important to understand the limits of sensitivity for these metrics, beyond which biologically meaningful matches cannot be reliably distinguished from noise. Here, we present a statistical sensitivity comparison of five common semantic similarity metrics (Jaccard, Resnik, Lin, Jiang \& Conrath, and Hybrid Relative Specificity Similarity) representing three different kinds of metrics (Edge based, Node based, and Hybrid) and four different methods of aggregating individual annotation similarities to estimate similarity between two biological objects - All Pairs, Best Pairs, Best Pairs Symmetric, and Groupwise. We explore key parameter choices that can impact sensitivity. To evaluate sensitivity in a controlled fashion, we explore two different models for simulating data with varying levels of similarity and compare to the noise distribution using resampling. Source data are derived from the Phenoscape Knowledgebase of evolutionary phenotypes. Our results indicate that the choice of similarity metric, along with different parameter choices, can substantially affect sensitivity. Among the five metrics evaluated, we find that Resnik similarity shows the greatest sensitivity to weak semantic similarity. Among the ways to combine pairwise statistics, the Groupwise approach provides the greatest discrimination among values above the sensitivity threshold, while the Best Pairs statistic can be parametrically tuned to provide the highest sensitivity. Our findings serve as a guideline for an appropriate choice and parameterization of semantic similarity metrics, and point to the need for improved reporting of the statistical significance of semantic similarity matches in cases where weak similarity is of interest.},
  keywords     = {annotation granularity, curation, ontology, phenotype, semantic similarity},
  url          = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_47.pdf},
  author       = {Manda, Prashanti and Vision, Todd},
}