% Bibliography entries on semantic similarity / relatedness measures.
%
% Conventions used below:
%   - one field per line, fields in alphabetical order, aligned equals signs;
%   - doi holds the bare identifier (no resolver prefix);
%   - month uses the predefined three-letter macros, unquoted;
%   - page ranges use a double hyphen.
% The fields interhash, intrahash, ee, date-added, date-modified and group are
% not standard fields and are ignored by the standard styles
% (presumably left over from the exporting tools -- TODO confirm).

@inproceedings{ring2015condist,
  author    = {Ring, Markus and Otto, Florian and Becker, Martin and Niebler, Thomas and Landes, Dieter and Hotho, Andreas},
  booktitle = {ECML PKDD 2015},
  interhash = {c062a57a17a0910d6c27ecd664502ac1},
  intrahash = {a2f9d649f2856677e4d886a3b517404d},
  title     = {{ConDist}: A Context-Driven Categorical Distance Measure},
  year      = 2015
}

% NOTE(review): the crossref target conf/conll/2014 is not defined in this
% part of the file; for classic BibTeX it must appear after this entry.
@inproceedings{conf/conll/LevyG14,
  author    = {Levy, Omer and Goldberg, Yoav},
  booktitle = {CoNLL},
  crossref  = {conf/conll/2014},
  editor    = {Morante, Roser and Yih, Wen-tau},
  ee        = {http://aclweb.org/anthology/W/W14/W14-1618.pdf},
  interhash = {680dde1fd83a8dd0d6b2619a8266516e},
  intrahash = {23bb00b6abab97ed93e74f3b5b148630},
  isbn      = {978-1-941643-02-0},
  pages     = {171--180},
  publisher = {ACL},
  title     = {Linguistic Regularities in Sparse and Explicit Word Representations},
  url       = {http://dblp.uni-trier.de/db/conf/conll/conll2014.html#LevyG14},
  year      = 2014
}

@inproceedings{Bollegala07semanticSearch,
  address   = {New York, NY, USA},
  author    = {Bollegala, Danushka and Matsuo, Yutaka and Ishizuka, Mitsuru},
  booktitle = {WWW '07: Proceedings of the 16th international conference on World Wide Web},
  doi       = {10.1145/1242572.1242675},
  interhash = {46247eb09b5e87a6e5d4a8b2cf821ee7},
  intrahash = {c957aa2fd65df63c8c4af14b1fc827c5},
  isbn      = {978-1-59593-654-7},
  location  = {Banff, Alberta, Canada},
  pages     = {757--766},
  publisher = {ACM},
  title     = {Measuring semantic similarity between words using web search engines},
  year      = 2007
}

@inproceedings{benz2009characterizing,
  address   = {Bled, Slovenia},
  author    = {Benz, Dominik and Krause, Beate and Kumar, G. Praveen and Hotho, Andreas and Stumme, Gerd},
  booktitle = {Proceedings of the 1st Workshop on Explorative Analytics of Information Networks (EIN2009)},
  interhash = {de5e58b26200e44112d9791f39e7523d},
  intrahash = {b697a98a7340585594455ee2e81d238a},
  month     = sep,
  title     = {Characterizing Semantic Relatedness of Search Query Terms},
  year      = 2009
}

% NOTE(review): the crossref target conf/wise/2008 is not defined in this
% part of the file; for classic BibTeX it must appear after this entry.
@inproceedings{conf/wise/EdaYY08,
  author    = {Eda, Takeharu and Yoshikawa, Masatoshi and Yamamuro, Masashi},
  booktitle = {WISE},
  crossref  = {conf/wise/2008},
  date      = {2008-08-25},
  doi       = {10.1007/978-3-540-85481-4_13},
  editor    = {Bailey, James and Maier, David and Schewe, Klaus-Dieter and Thalheim, Bernhard and Wang, Xiaoyang Sean},
  ee        = {http://dx.doi.org/10.1007/978-3-540-85481-4_13},
  interhash = {9d4e2e5c9ea51b5ee850d328eb940524},
  intrahash = {a9d9bbe9f365dc1da1df79fffbf95a0d},
  isbn      = {978-3-540-85480-7},
  pages     = {151--162},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  title     = {Locally Expandable Allocation of Folksonomy Tags in a Directed Acyclic Graph},
  url       = {http://dblp.uni-trier.de/db/conf/wise/wise2008.html#EdaYY08},
  volume    = 5175,
  year      = 2008
}

@inproceedings{1281204,
  address   = {New York, NY, USA},
  author    = {Baeza-Yates, Ricardo and Tiberi, Alessandro},
  booktitle = {KDD '07: Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining},
  doi       = {10.1145/1281192.1281204},
  interhash = {26ca034be705abaf072835784f53d877},
  intrahash = {6e45b65feffd1545c6dca62bf4b8f53d},
  isbn      = {978-1-59593-609-7},
  location  = {San Jose, California, USA},
  pages     = {76--85},
  publisher = {ACM},
  title     = {Extracting semantic relations from query logs},
  url       = {http://portal.acm.org/citation.cfm?id=1281204},
  year      = 2007
}

@inproceedings{1135858,
  abstract  = {It has become a promising direction to measure similarity of Web search queries
               by mining the increasing amount of click-through data logged by Web search engines,
               which record the interactions between users and the search engines. Most existing
               approaches employ the click-through data for similarity measure of queries with
               little consideration of the temporal factor, while the click-through data is often
               dynamic and contains rich temporal information. In this paper we present a new
               framework of time-dependent query semantic similarity model on exploiting the
               temporal characteristics of historical click-through data. The intuition is that
               more accurate semantic similarity values between queries can be obtained by taking
               into account the timestamps of the log data. With a set of user-defined calendar
               schema and calendar patterns, our time-dependent query similarity model is
               constructed using the marginalized kernel technique, which can exploit both explicit
               similarity and implicit semantics from the click-through data effectively.
               Experimental results on a large set of click-through data acquired from a commercial
               search engine show that our time-dependent query similarity model is more accurate
               than the existing approaches. Moreover, we observe that our time-dependent query
               similarity model can, to some extent, reflect real-world semantics such as
               real-world events that are happening over time.},
  address   = {New York, NY, USA},
  author    = {Zhao, Qiankun and Hoi, Steven C. H. and Liu, Tie-Yan and Bhowmick, Sourav S. and Lyu, Michael R. and Ma, Wei-Ying},
  booktitle = {WWW '06: Proceedings of the 15th international conference on World Wide Web},
  doi       = {10.1145/1135777.1135858},
  interhash = {c765e101c37f6b530e2c1c59808048d7},
  intrahash = {57cbc64550d3a1b5b8599a0783e95111},
  isbn      = {1-59593-323-9},
  location  = {Edinburgh, Scotland},
  pages     = {543--552},
  publisher = {ACM Press},
  title     = {Time-dependent semantic similarity measure of queries using historical click-through data},
  url       = {http://portal.acm.org/citation.cfm?id=1135858},
  year      = 2006
}

@article{green99hypertext,
  author    = {Green, S. J.},
  interhash = {64375af41410dfef182148d28064dbb5},
  intrahash = {eb4a58e09c5bab13b745fa0bea1074a2},
  journal   = {IEEE Transactions on Knowledge and Data Engineering},
  pages     = {713--730},
  title     = {Building Hypertext Links By Computing Semantic Similarity},
  volume    = 11,
  year      = 1999
}

% NOTE(review): volume carries the preprint identifier that also appears in
% the ee link; kept as exported so the rendered reference does not change.
@article{jiang97semantic,
  author    = {Jiang, Jay J. and Conrath, David W.},
  ee        = {http://arxiv.org/abs/cmp-lg/9709008},
  interhash = {175ec03ee8c47d4b2d0a083609a78e05},
  intrahash = {9fb00ff398ecc49ee58bfe6ecaaec108},
  journal   = {CoRR},
  title     = {Semantic similarity based on corpus statistics and lexical taxonomy},
  volume    = {cmp-lg/9709008},
  year      = 1997
}

% NOTE(review): the exported page range was the degenerate 641--641; the end
% page 650 should be confirmed against the publisher record.
@inproceedings{www200965,
  abstract  = {Social bookmarking systems and their emergent information structures, known as
               folksonomies, are increasingly important data sources for Semantic Web applications.
               A key question for harvesting semantics from these systems is how to extend and
               adapt traditional notions of similarity to folksonomies, and which measures are best
               suited for applications such as navigation support, semantic search, and ontology
               learning. Here we build an evaluation framework to compare various general
               folksonomy-based similarity measures derived from established information-theoretic,
               statistical, and practical measures. Our framework deals generally and symmetrically
               with users, tags, and resources. For evaluation purposes we focus on similarity
               among tags and resources, considering different ways to aggregate annotations across
               users. After comparing how tag similarity measures predict user-created tag
               relations, we provide an external grounding by user-validated semantic proxies based
               on WordNet and the Open Directory. We also investigate the issue of scalability. We
               find that mutual information with distributional micro-aggregation across users
               yields the highest accuracy, but is not scalable; per-user projection with
               collaborative aggregation provides the best scalable approach via incremental
               computations. The results are consistent across resource and tag similarity.},
  author    = {Markines, Benjamin and Cattuto, Ciro and Menczer, Filippo and Benz, Dominik and Hotho, Andreas and Stumme, Gerd},
  booktitle = {18th International World Wide Web Conference},
  interhash = {a266558ad4d83d536a0be2ac94b6b7df},
  intrahash = {d16e752a8295d5dad7e26b199d9f614f},
  month     = apr,
  pages     = {641--650},
  title     = {Evaluating Similarity Measures for Emergent Semantics of Social Tagging},
  url       = {http://www2009.eprints.org/65/},
  year      = 2009
}

@inproceedings{1034693,
  address   = {Morristown, NJ, USA},
  author    = {Lee, Lillian},
  booktitle = {Proceedings of the 37th annual meeting of the Association for Computational Linguistics on Computational Linguistics},
  doi       = {10.3115/1034678.1034693},
  interhash = {508bc7126eaf0feede9ac5c47ce20238},
  intrahash = {0dfba1feb19c33be82d8df3ebbec7b70},
  isbn      = {1-55860-609-3},
  location  = {College Park, Maryland},
  pages     = {25--32},
  publisher = {Association for Computational Linguistics},
  title     = {Measures of distributional similarity},
  url       = {http://portal.acm.org/citation.cfm?id=1034693&dl=},
  year      = 1999
}

@phdthesis{Curran:2003,
  abstract      = {Lexical-semantic resources, including thesauri and WordNet, have been
                   successfully incorporated into a wide range of applications in Natural Language
                   Processing. However they are very difficult and expensive to create and
                   maintain, and their usefulness has been severely hampered by their limited
                   coverage, bias and inconsistency. Automated and semi-automated methods for
                   developing such resources are therefore crucial for further resource development
                   and improved application performance. Systems that extract thesauri often
                   identify similar words using the distributional hypothesis that similar words
                   appear in similar contexts. This approach involves using corpora to examine the
                   contexts each word appears in and then calculating the similarity between
                   context distributions. Different definitions of context can be used, and I begin
                   by examining how different types of extracted context influence similarity. To
                   be of most benefit these systems must be capable of finding synonyms for rare
                   words. Reliable context counts for rare events can only be extracted from vast
                   collections of text. In this dissertation I describe how to extract contexts
                   from a corpus of over 2 billion words. I describe techniques for processing text
                   on this scale and examine the trade-off between context accuracy, information
                   content and quantity of text analysed. Distributional similarity is at best an
                   approximation to semantic similarity. I develop improved approximations
                   motivated by the intuition that some events in the context distribution are more
                   indicative of meaning than others. For instance, the object-of-verb context wear
                   is far more indicative of a clothing noun than get. However, existing
                   distributional techniques do not effectively utilise this information. The new
                   context-weighted similarity metric I propose in this dissertation significantly
                   outperforms every distributional similarity metric described in the literature.
                   Nearest-neighbour similarity algorithms scale poorly with vocabulary and context
                   vector size. To overcome this problem I introduce a new context-weighted
                   approximation algorithm with bounded complexity in context vector size that
                   significantly reduces the system runtime with only a minor performance penalty.
                   I also describe a parallelized version of the system that runs on a Beowulf
                   cluster for the 2 billion word experiments. To evaluate the context-weighted
                   similarity measure I compare ranked similarity lists against gold-standard
                   resources using precision and recall-based measures from Information Retrieval,
                   since the alternative, application-based evaluation, can often be influenced by
                   distributional as well as semantic similarity. I also perform a detailed
                   analysis of the final results using WordNet. Finally, I apply my similarity
                   metric to the task of assigning words to WordNet semantic categories. I
                   demonstrate that this new approach outperforms existing methods and overcomes
                   some of their weaknesses.},
  author        = {Curran, James Richard},
  date-added    = {2007-12-03 15:18:56 -0500},
  date-modified = {2008-07-04 12:38:50 -0400},
  group         = {Data Mining; Knowledge Organization},
  interhash     = {5b18146caf6a2e0dc18a95ef0cd4e6ce},
  intrahash     = {57de9154de9e4848eb5989f9ca7fdcbb},
  school        = {Institute for Communicating and Collaborative Systems, School of Informatics, University of Edinburgh},
  title         = {From Distributional to Semantic Similarity},
  url           = {http://www.era.lib.ed.ac.uk/bitstream/1842/563/2/IP030023.pdf},
  year          = 2003
}

% eprint/archiveprefix repeat the identifier embedded in the url field.
@misc{cattuto-2008,
  abstract      = {Social bookmarking systems allow users to organise collections of resources on
                   the Web in a collaborative fashion. The increasing popularity of these systems
                   as well as first insights into their emergent semantics have made them relevant
                   to disciplines like knowledge extraction and ontology learning. The problem of
                   devising methods to measure the semantic relatedness between tags and
                   characterizing it semantically is still largely open. Here we analyze three
                   measures of tag relatedness: tag co-occurrence, cosine similarity of
                   co-occurrence distributions, and FolkRank, an adaptation of the PageRank
                   algorithm to folksonomies. Each measure is computed on tags from a large-scale
                   dataset crawled from the social bookmarking system del.icio.us. To provide a
                   semantic grounding of our findings, a connection to WordNet (a semantic lexicon
                   for the English language) is established by mapping tags into synonym sets of
                   WordNet, and applying there well-known metrics of semantic similarity. Our
                   results clearly expose different characteristics of the selected measures of
                   relatedness, making them applicable to different subtasks of knowledge
                   extraction such as synonym detection or discovery of concept hierarchies.},
  archiveprefix = {arXiv},
  author        = {Cattuto, Ciro and Benz, Dominik and Hotho, Andreas and Stumme, Gerd},
  eprint        = {0805.2045},
  interhash     = {cc62b733f6e0402db966d6dbf1b7711f},
  intrahash     = {78fd64c3db55e6387ebdeb6c40054542},
  title         = {Semantic Analysis of Tag Similarity Measures in Collaborative Tagging Systems},
  url           = {http://www.citebase.org/abstract?id=oai:arXiv.org:0805.2045},
  year          = 2008
}

% NOTE(review): address holds what looks like the authors' affiliation rather
% than a publisher city; left as exported.
@article{wiley1991,
  address   = {Department of Information Studies, University of Sheffield, Western Bank, Sheffield S10 2TN, United Kingdom},
  author    = {Peat, Helen J. and Willett, Peter},
  doi       = {10.1002/(SICI)1097-4571(199106)42:5<378::AID-ASI8>3.0.CO;2-8},
  interhash = {b3cbc3b9fe13100408c5ef4c2c175b01},
  intrahash = {96cc916a85ed034ea7fa72e4259b9f05},
  journal   = {Journal of the American Society for Information Science},
  number    = 5,
  pages     = {378--383},
  publisher = {John Wiley \& Sons, Inc.},
  title     = {The limitations of term co-occurrence data for query expansion in document retrieval systems},
  url       = {http://www.iro.umontreal.ca/~nie/IFT6255/Peat_Willett_QExp.pdf},
  volume    = 42,
  year      = 1991
}

% NOTE(review): the citation key looks like an unfilled placeholder; it is
% kept unchanged so that existing citations continue to resolve.
@article{keyhere,
  abstract  = {We have developed an efficient way to determine the syntactic similarity of files
               and have applied it to every document on the World Wide Web. Using this mechanism,
               we built a clustering of all the documents that are syntactically similar. Possible
               applications include a ``Lost and Found'' service, filtering the results of Web
               searches, updating widely distributed web-pages, and identifying violations of
               intellectual property rights.},
  author    = {Broder, Andrei Z. and Glassman, Steven C. and Manasse, Mark S. and Zweig, Geoffrey},
  booktitle = {Papers from the Sixth International World Wide Web Conference},
  interhash = {424cdc36335873e4d8c0bed6e07e872e},
  intrahash = {93a3440b81c13ec81c17481a97719c71},
  journal   = {Computer Networks and ISDN Systems},
  month     = sep,
  number    = {8-13},
  pages     = {1157--1166},
  title     = {Syntactic clustering of the {Web}},
  url       = {http://www.sciencedirect.com/science/article/B6TYT-3SP60S4-11/2/38f44c816ec8d69b406317de1629e56d},
  volume    = 29,
  year      = 1997
}

% The conference dates (March 26-31) are carried by month and eventdate
% instead of being embedded in the address string.
@inproceedings{ziegler2006detecting,
  address   = {Munich, Germany},
  author    = {Ziegler, Patrick and Kiefer, Christoph and Sturm, Christoph and Dittrich, Klaus R. and Bernstein, Abraham},
  booktitle = {10th International Conference on Extending Database Technology (EDBT 2006)},
  editor    = {Ioannidis, Yannis and Scholl, Marc H. and Schmidt, Joachim W. and Matthes, Florian and Hatzopoulos, Mike and Boehm, Klemens and Kemper, Alfons and Grust, Torsten and Boehm, Christian},
  eventdate = {2006-03-26/2006-03-31},
  interhash = {ceef2889a8e6f3a7feeed81b8a3e2fd0},
  intrahash = {5c9b572e42068353b11768b775cb33f9},
  month     = mar,
  pages     = {59--76},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  title     = {Detecting Similarities in Ontologies with the {SOQA-SimPack} Toolkit},
  volume    = 3896,
  year      = 2006
}