@conference {ICBO_2018_2, title = {ICBO_2018_2: Adapting Disease Vocabularies for Curation at the Rat Genome Database}, booktitle = {International Conference on Biomedical Ontology (ICBO 2018)}, series = {Proceedings of the International Conference on Biomedical Ontology (2018)}, year = {2018}, month = {08/06/2018}, publisher = {International Conference on Biomedical Ontology}, organization = {International Conference on Biomedical Ontology}, abstract = {

The Rat Genome Database (RGD) has been annotating genes, QTLs, and strains with disease terms for over 15 years. During that time the controlled vocabulary used for disease curation has changed several times, because no single vocabulary or ontology was freely accessible and complete enough to cover all of the disease states described in the biomedical literature. The first disease vocabulary used at RGD was the {\textquotedblleft}C{\textquotedblright} branch of the National Library of Medicine{\textquoteright}s Medical Subject Headings (MeSH). By 2011 RGD had switched disease curation to MEDIC (MErged DIsease voCabulary), a combination of MeSH and OMIM (Online Mendelian Inheritance in Man) constructed by curators at the Comparative Toxicogenomics Database (CTD). MEDIC was an improvement over MeSH because of the added coverage of OMIM terms, but it was not long before RGD curators saw the need for still more disease terms. Within a couple of years, RGD began to add terms to MEDIC under the name of the RGD Disease Ontology (RDO). Since RGD assigned a unique ID to every MEDIC term imported from CTD, it was straightforward to assign specially coded IDs to the additional terms, which were maintained in a separate, supplemental file. Meanwhile, the human Disease Ontology (DO) had slowly been developing and expanding. As early as 2010, members of RGD were contributing to the development of DO. Based on the promise of these improvements, it was determined that the Alliance of Genome Resources could use DO as a unifying disease vocabulary across model organism databases. Despite the improvements in DO, RGD still had more than 1000 custom terms and 3800 MEDIC terms with annotations to account for if it were to convert to DO. Mapping those non-DO disease terms directly to DO would lose much granularity of meaning. To avoid that loss, it was decided to extend DO after importing the merged, already axiomatized DO file. By mapping DO completely to the RGD version of MEDIC, RGD has achieved a broader, deeper disease vocabulary.
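As a minimal sketch of the ID scheme described above (the file names, columns, and ID prefixes are illustrative, not the actual RGD schema), imported MEDIC terms and supplemental custom terms can be keyed by namespaced local IDs so the two sources never collide:

    import csv

    def load_terms(path, id_prefix):
        # Read "term_id,term_name,parent_id" rows and key them by a
        # namespaced local ID so imported and custom terms cannot collide.
        terms = {}
        with open(path, newline="") as fh:
            for row in csv.DictReader(fh):
                terms[f"{id_prefix}:{row['term_id']}"] = {
                    "name": row["term_name"],
                    "parent": row["parent_id"] or None,
                }
        return terms

    vocabulary = load_terms("medic.csv", "RDO")           # terms imported from CTD
    vocabulary.update(load_terms("custom.csv", "CUSTOM")) # supplemental RGD terms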

}, keywords = {curation, disease vocabularies, online resource, Rat Genome Database}, url = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_2.pdf}, author = {Stan Laulederkind and G. Thomas Hayman and Shur-Jen Wang and Elizabeth Bolton and Jennifer R. Smith and Marek Tutaj and Jeff de Pons and Mary Shimoyama and Melinda Dwinell} }

@conference {ICBO_2018_4, title = {ICBO_2018_4: Comparison of Natural Language Processing Tools for Automatic Gene Ontology Annotation of Scientific Literature}, booktitle = {International Conference on Biomedical Ontology (ICBO 2018)}, series = {Proceedings of the International Conference on Biomedical Ontology (2018)}, year = {2018}, month = {08/06/2018}, publisher = {International Conference on Biomedical Ontology}, organization = {International Conference on Biomedical Ontology}, abstract = {

Manual curation of scientific literature for ontology-based knowledge representation has proven unable to scale to the large and growing volume of that literature. Automated annotation solutions that leverage text mining and Natural Language Processing (NLP) have been developed to ameliorate the problem of literature curation. These NLP approaches use parsing, syntactic, and lexical analysis of text to recognize and annotate pieces of text with ontology concepts. Here, we compare four state-of-the-art NLP tools on the task of recognizing Gene Ontology concepts in biomedical literature, using the Colorado Richly Annotated Full-Text (CRAFT) corpus as a gold-standard reference. We demonstrate the use of semantic similarity metrics to compare NLP tool annotations to the gold standard.
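As a minimal, hypothetical illustration of scoring tool annotations against a gold standard with a semantic similarity metric (the abridged GO ancestor closures below are for illustration only, not the pipeline evaluated in the paper):

    # Abridged ancestor closures (each term plus its ancestors).
    ANCESTORS = {
        "GO:0006915": {"GO:0006915", "GO:0012501", "GO:0008219", "GO:0008150"},
        "GO:0012501": {"GO:0012501", "GO:0008219", "GO:0008150"},
        "GO:0008219": {"GO:0008219", "GO:0008150"},
    }

    def closure(terms):
        # Union of ancestor closures for a set of annotations.
        out = set()
        for t in terms:
            out |= ANCESTORS.get(t, {t})
        return out

    def semantic_jaccard(gold, predicted):
        # Jaccard overlap of ancestor closures, so a tool that predicts a
        # close parent of the curated term still receives partial credit.
        g, p = closure(gold), closure(predicted)
        return len(g & p) / len(g | p) if g | p else 1.0

    gold = {"GO:0006915"}       # curator annotation in the gold standard
    predicted = {"GO:0012501"}  # tool predicted the parent concept
    print(f"semantic Jaccard = {semantic_jaccard(gold, predicted):.2f}")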

}, keywords = {curation, gene ontology, natural language processing, semantic similarity, text mining}, url = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_4.pdf}, author = {Lucas Beasley and Prashanti Manda} }

@conference {ICBO_2018_47, title = {ICBO_2018_47: On the statistical sensitivity of semantic similarity metrics}, booktitle = {International Conference on Biomedical Ontology (ICBO 2018)}, series = {Proceedings of the International Conference on Biomedical Ontology (2018)}, year = {2018}, month = {08/06/2018}, publisher = {International Conference on Biomedical Ontology}, organization = {International Conference on Biomedical Ontology}, abstract = {

Measuring the semantic similarity between objects that have been annotated with ontological terms is fundamental to an increasing number of biomedical applications, and several different ontologically aware semantic similarity metrics are in common use. In some of these applications, only weak semantic similarity is expected even for biologically meaningful matches. In such cases, it is important to understand the limits of sensitivity of these metrics, beyond which biologically meaningful matches cannot be reliably distinguished from noise. Here, we present a statistical sensitivity comparison of five common semantic similarity metrics (Jaccard, Resnik, Lin, Jiang \& Conrath, and Hybrid Relative Specificity Similarity), representing three kinds of metric (edge-based, node-based, and hybrid), and four methods of aggregating individual annotation similarities to estimate the similarity between two biological objects: All Pairs, Best Pairs, Best Pairs Symmetric, and Groupwise. We explore key parameter choices that can impact sensitivity. To evaluate sensitivity in a controlled fashion, we use two different models for simulating data with varying levels of similarity and compare the results to the noise distribution using resampling. Source data are derived from the Phenoscape Knowledgebase of evolutionary phenotypes. Our results indicate that the choice of similarity metric, along with different parameter choices, can substantially affect sensitivity. Among the five metrics evaluated, Resnik similarity shows the greatest sensitivity to weak semantic similarity. Among the ways of combining pairwise statistics, the Groupwise approach provides the greatest discrimination among values above the sensitivity threshold, while the Best Pairs statistic can be parametrically tuned to provide the highest sensitivity. Our findings serve as a guideline for the appropriate choice and parameterization of semantic similarity metrics, and point to the need for improved reporting of the statistical significance of semantic similarity matches in cases where weak similarity is of interest.
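A minimal sketch, assuming a toy ontology and made-up term frequencies (not the Phenoscape data or the code used in the paper), of Resnik similarity with Best Pairs aggregation and a resampled noise distribution:

    import math
    import random

    # Toy ontology: child -> parents (made up for illustration).
    PARENTS = {
        "fin": {"appendage"},
        "limb": {"appendage"},
        "appendage": {"structure"},
        "structure": set(),
    }

    def ancestors(term):
        # All ancestors of a term, including the term itself.
        seen, stack = {term}, [term]
        while stack:
            for p in PARENTS[stack.pop()]:
                if p not in seen:
                    seen.add(p)
                    stack.append(p)
        return seen

    # Made-up annotation frequencies; information content IC(t) = -log p(t).
    P = {"fin": 0.1, "limb": 0.2, "appendage": 0.4, "structure": 1.0}
    IC = {t: -math.log(p) for t, p in P.items()}

    def resnik(t1, t2):
        # IC of the most informative common ancestor of the two terms.
        return max(IC[a] for a in ancestors(t1) & ancestors(t2))

    def best_pairs(profile_a, profile_b):
        # Best Pairs aggregation (asymmetric variant): average, over the
        # terms of profile A, of each term's best match in profile B.
        return sum(max(resnik(a, b) for b in profile_b)
                   for a in profile_a) / len(profile_a)

    # Resampled noise distribution: scores of randomly drawn profiles of
    # the same size, against which an observed score can be judged.
    terms = list(P)
    null = [best_pairs(random.sample(terms, 2), random.sample(terms, 2))
            for _ in range(1000)]
    obs = best_pairs(["fin"], ["limb"])
    pval = sum(s >= obs for s in null) / len(null)
    print(f"observed = {obs:.3f}, empirical p = {pval:.3f}")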

}, keywords = {annotation granularity, curation, ontology, phenotype, semantic similarity}, url = {http://ceur-ws.org/Vol-2285/ICBO_2018_paper_47.pdf}, author = {Prashanti Manda and Todd Vision} }