@article{steenweg2004neuere, abstract = {Ausgehend von neueren Entwicklungen im Publikations- und Informationswesen, die einen nicht unwesentlichen Einfluss auf das zukünftige wissenschaftliche Publizieren haben werden, wird der künftige Zugriff auf Open-Access-Archive thematisiert. Um dringend notwendige Veränderungen herbeizuführen, bedarf es neben einem Mentalitätswandel bei den Autoren auch der Schaffung einer funktionierenden umfassenden Infrastruktur von Dokumentenservern mit Recherche und Archivierung, um die nötige Akzeptanz bei Autoren und Lesern zu erlangen. Eine Möglichkeit dazu böte eine OpenRep-Initiative, ein Netz von modular aufgebauten "Institutional Repositories".}, author = {Steenweg, Helge}, interhash = {8b414acafb896664d8fc05445eec6177}, intrahash = {8ffaeaf3cbc35354380da6a0a7c32aac}, journal = {ABI-Technik}, number = 4, pages = {282-293.}, title = {Neuere Entwicklungen im Informations- und Publikationswesen und ihre möglichen Auswirkungen}, url = {http://kobra.bibliothek.uni-kassel.de/bitstream/urn:nbn:de:hebis:34-200602086382/6/abi4-2004-04.pdf}, volume = 24, year = 2004 } @book{manning2008, author = {Manning, Christopher D. and Raghavan, Prabhakar and Schütze, Hinrich}, interhash = {2e574e46b7668a7268e7f02b46f4d9bb}, intrahash = {9f4ab13e07b48b9723113aa74224be65}, publisher = {Cambridge University Press}, title = {Introduction to Information Retrieval}, year = 2008 } @inproceedings{jaschke2013attribute, abstract = {We propose an approach for supporting attribute exploration by web information retrieval, in particular by posing appropriate queries to search engines, crowd sourcing systems, and the linked open data cloud. 
We discuss underlying general assumptions for this to work and the degree to which these can be taken for granted.}, author = {Jäschke, Robert and Rudolph, Sebastian}, booktitle = {Contributions to the 11th International Conference on Formal Concept Analysis}, editor = {Cellier, Peggy and Distel, Felix and Ganter, Bernhard}, interhash = {000ab7b0ae3ecd1d7d6ceb39de5c11d4}, intrahash = {45e900e280661d775d8da949baee3747}, month = may, organization = {Technische Universität Dresden}, pages = {19--34}, title = {Attribute Exploration on the Web}, url = {http://nbn-resolving.de/urn:nbn:de:bsz:14-qucosa-113133}, urn = {urn:nbn:de:bsz:14-qucosa-113133}, year = 2013 } @techreport{ritchie2009citation, abstract = {This thesis investigates taking words from around citations to scientific papers in order to create an enhanced document representation for improved information retrieval. This method parallels how anchor text is commonly used in Web retrieval. In previous work, words from citing documents have been used as an alternative representation of the cited document but no previous experiment has combined them with a full-text document representation and measured effectiveness in a large scale evaluation. The contributions of this thesis are twofold: firstly, we present a novel document representation, along with experiments to measure its effect on retrieval effectiveness, and, secondly, we document the construction of a new, realistic test collection of scientific research papers, with references (in the bibliography) and their associated citations (in the running text of the paper) automatically annotated. Our experiments show that the citation-enhanced document representation increases retrieval effectiveness across a range of standard retrieval models and evaluation measures. 
In Chapter 2, we give the background to our work, discussing the various areas from which we draw together ideas: information retrieval, particularly link structure analysis and anchor text indexing, and bibliometrics, in particular citation analysis. We show that there is a close relatedness of ideas between these areas but that these ideas have not been fully explored experimentally. Chapter 3 discusses the test collection paradigm for evaluation of information retrieval systems and describes how and why we built our test collection. In Chapter 4, we introduce the ACL Anthology, the archive of computational linguistics papers that our test collection is centred around. The archive contains the most prominent publications since the beginning of the field in the early 1960s, consisting of one journal plus conferences and workshops, resulting in over 10,000 papers. Chapter 5 describes how the PDF papers are prepared for our experiments, including identification of references and citations in the papers, once converted to plain text, and extraction of citation information to an XML database. Chapter 6 presents our experiments: we show that adding citation terms to the full-text of the papers improves retrieval effectiveness by up to 7.4%, that weighting citation terms higher relative to paper terms increases the improvement and that varying the context from which citation terms are taken has a significant effect on retrieval effectiveness. Our main hypothesis that citation terms enhance a full-text representation of scientific papers is thus proven. There are some limitations to these experiments. The relevance judgements in our test collection are incomplete but we have experimentally verified that the test collection is, nevertheless, a useful evaluation tool. Using the Lemur toolkit constrained the method that we used to weight citation terms; we would like to experiment with a more realistic implementation of term weighting. 
Our experiments with different citation contexts did not conclude an optimal citation context; we would like to extend the scope of our investigation. Now that our test collection exists, we can address these issues in our experiments and leave the door open for more extensive experimentation. }, address = {Cambridge, UK}, author = {Ritchie, Anna}, institution = {University of Cambridge}, interhash = {f086fdcd7eb1df44ef67b96f2e91996c}, intrahash = {aa4271a2a958fe2c1a65dbdd508d8de7}, issn = {1476-2986}, month = mar, number = 744, title = {Citation context analysis for information retrieval}, url = {https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-744.pdf}, year = 2009 } @book{manning2008introduction, abstract = {"Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective. Slides and additional exercises (with solutions for lecturers) are also available through the book's supporting website to help course instructors prepare their lectures." -- Publisher's description.}, address = {New York}, author = {Manning, Christopher D. 
and Raghavan, Prabhakar and Schütze, Hinrich}, interhash = {2e574e46b7668a7268e7f02b46f4d9bb}, intrahash = {9f4ab13e07b48b9723113aa74224be65}, isbn = {9780521865715 0521865719}, publisher = {Cambridge University Press}, title = {Introduction to Information Retrieval}, url = {http://www.amazon.com/Introduction-Information-Retrieval-Christopher-Manning/dp/0521865719/ref=sr_1_1?ie=UTF8&qid=1337379279&sr=8-1}, year = 2008 } @book{koester2006fooca, abstract = {This book deals with Formal Concept Analysis (FCA) and its application to Web Information Retrieval. It explains how Web search results retrieved by major Web search engines such as Google or Yahoo can be conceptualized leading to a human-oriented form of representation. A generalization of Web search results is conducted, leading to an FCA-based introduction of FooCA. FooCA is an application in the field of Conceptual Knowledge Processing and supports the idea of a holistic representation of Web Information Retrieval.}, address = {Mühltal}, author = {Koester, Bjoern}, interhash = {fe53b2b1fa6be34259647954fca36bf8}, intrahash = {5571d950ada3ee1892e5c043ac438271}, publisher = {Verlag Allgemeine Wissenschaft}, series = {Beiträge zur begrifflichen Wissensverarbeitung}, title = {FooCA: web information retrieval with formal concept analysis}, url = {http://www.bjoern-koester.de/fooca/web_information_retrieval_with_formal_concept_analysis.html}, year = 2006 } @inproceedings{joachims2002optimizing, abstract = {This paper presents an approach to automatically optimizing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous approaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them difficult and expensive to apply. 
The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be recorded at very low cost. Taking a Support Vector Machine (SVM) approach, this paper presents a method for learning retrieval functions. From a theoretical perspective, this method is shown to be well-founded in a risk minimization framework. Furthermore, it is shown to be feasible even for large sets of queries and features. The theoretical results are verified in a controlled experiment. It shows that the method can effectively adapt the retrieval function of a meta-search engine to a particular group of users, outperforming Google in terms of retrieval quality after only a couple of hundred training examples.}, acmid = {775067}, address = {New York, NY, USA}, author = {Joachims, Thorsten}, booktitle = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, doi = {10.1145/775047.775067}, interhash = {c78df69370bbf12636eaa5233b1fba83}, intrahash = {656a83f1057c5792506d0d656ae81d26}, isbn = {1-58113-567-X}, location = {Edmonton, Alberta, Canada}, numpages = {10}, pages = {133--142}, publisher = {ACM}, title = {Optimizing search engines using clickthrough data}, url = {http://doi.acm.org/10.1145/775047.775067}, year = 2002 } @techreport{gomes2012creating, abstract = {The web became a mass means of publication that has been replacing printed media. However, its information is extremely ephemeral. Currently, most of the information available on the web is less than 1 year old. There are several initiatives worldwide that struggle to archive information from the web before it vanishes. However, search mechanisms to access this information are still limited and do not satisfy their users that demand performance similar to live- web search engines. 
This paper presents some of the work developed to create an effi�cient and effective searchable web archive service, from data acquisition to user interface design. The results of research were applied in practice to create the Portuguese Web Archive that is publicly available since January 2010. It supports full-text search over 1 billion contents archived from 1996 to 2010. The developed software is available as an open source project.}, address = {Portugal}, author = {Gomes, Daniel and Cruz, David and Miranda, João and Costa, Miguel and Fontes, Simão}, institution = {Foundation for National Scientific Computing}, interhash = {b5c01e5cadcc1d8ef44d48b2022144d2}, intrahash = {da5b8a339b2c3d765c3b0a7bd025af82}, month = may, title = {Creating a searchable web archive}, url = {http://web.ist.utl.pt/joaocarvalhomiranda/docs/other/creating-a-searchable-web-archive-relatorio.pdf}, year = 2012 } @article{alonso2008crowdsourcing, abstract = {Relevance evaluation is an essential part of the development and maintenance of information retrieval systems. Yet traditional evaluation approaches have several limitations; in particular, conducting new editorial evaluations of a search system can be very expensive. We describe a new approach to evaluation called TERC, based on the crowdsourcing paradigm, in which many online users, drawn from a large community, each performs a small evaluation task.}, acmid = {1480508}, address = {New York, NY, USA}, author = {Alonso, Omar and Rose, Daniel E. 
and Stewart, Benjamin}, doi = {10.1145/1480506.1480508}, interhash = {8441d7fed92813634f61fa148ef2b870}, intrahash = {4a47833e85558b740788607cb79ba795}, issn = {0163-5840}, issue_date = {December 2008}, journal = {SIGIR Forum}, month = nov, number = 2, numpages = {7}, pages = {9--15}, publisher = {ACM}, title = {Crowdsourcing for relevance evaluation}, url = {http://doi.acm.org/10.1145/1480506.1480508}, volume = 42, year = 2008 } @article{cha2007comprehensive, abstract = {Distance or similarity measures are essential to solve many pattern recognition problems such as classification, clustering, and retrieval problems. Various distance/similarity measures that are applicable to compare two probability density functions, pdf in short, are reviewed and categorized in both syntactic and semantic relationships. A correlation coefficient and a hierarchical clustering technique are adopted to reveal similarities among numerous distance/similarity measures.}, author = {Cha, Sung-Hyuk}, interhash = {dfaf5e38d33eaab89f3643b242910c81}, intrahash = {69e7c9ba92a049efa4c70f8f0bfdb4ea}, journal = {International Journal of Mathematical Models and Methods in Applied Sciences}, number = 4, pages = {300--307}, title = {Comprehensive Survey on Distance/Similarity Measures between Probability Density Functions}, url = {http://www.gly.fsu.edu/~parker/geostats/Cha.pdf}, volume = 1, year = 2007 } @book{manning2008introduction, abstract = {"Class-tested and coherent, this textbook teaches classical and web information retrieval, including web search and the related areas of text classification and text clustering from basic concepts. It gives an up-to-date treatment of all aspects of the design and implementation of systems for gathering, indexing, and searching documents; methods for evaluating systems; and an introduction to the use of machine learning methods on text collections. 
All the important ideas are explained using examples and figures, making it perfect for introductory courses in information retrieval for advanced undergraduates and graduate students in computer science. Based on feedback from extensive classroom experience, the book has been carefully structured in order to make teaching more natural and effective. Slides and additional exercises (with solutions for lecturers) are also available through the book's supporting website to help course instructors prepare their lectures." -- Publisher's description.}, address = {New York}, author = {Manning, Christopher D. and Raghavan, Prabhakar and Schütze, Hinrich}, interhash = {2e574e46b7668a7268e7f02b46f4d9bb}, intrahash = {9f4ab13e07b48b9723113aa74224be65}, isbn = {9780521865715 0521865719}, publisher = {Cambridge University Press}, title = {Introduction to Information Retrieval}, url = {http://www.amazon.com/Introduction-Information-Retrieval-Christopher-Manning/dp/0521865719/ref=sr_1_1?ie=UTF8&qid=1337379279&sr=8-1}, year = 2008 } @inproceedings{poelmans2011mining, abstract = {Formal Concept Analysis (FCA) is an unsupervised clustering technique and many scientific papers are devoted to applying FCA in Information Retrieval (IR) research. We collected 103 papers published between 2003-2009 which mention FCA and information retrieval in the abstract, title or keywords. Using a prototype of our FCA-based toolset CORDIET, we converted the pdf-files containing the papers to plain text, indexed them with Lucene using a thesaurus containing terms related to FCA research and then created the concept lattice shown in this paper. We visualized, analyzed and explored the literature with concept lattices and discovered multiple interesting research streams in IR of which we give an extensive overview. The core contributions of this paper are the innovative application of FCA to the text mining of scientific papers and the survey of the FCA-based IR research. 
}, author = {Poelmans, Jonas and Elzinga, Paul and Viaene, Stijn and Dedene, Guido and Kuznetsov, Sergei O.}, booktitle = {Industrial Conference on Data Mining - Poster and Industry Proceedings}, editor = {Perner, Petra}, interhash = {b44d11ea5b5a4df8ee30a9c572d82051}, intrahash = {164c37be60c1a47d1727ad9b82f01237}, isbn = {978-3-942954-06-4}, pages = {82--96}, publisher = {IBaI Publishing}, title = {Text Mining Scientific Papers: a Survey on {FCA}-based Information Retrieval Research.}, url = {http://dblp.uni-trier.de/db/conf/incdm/incdm2011p.html#PoelmansEVDK11}, year = 2011 } @inproceedings{hotho2006folkrank, abstract = { In social bookmark tools users are setting up lightweight conceptual structures called folksonomies. Currently, the information retrieval support is limited. We present a formal model and a new search algorithm for folksonomies, called FolkRank, that exploits the structure of the folksonomy. The proposed algorithm is also applied to find communities within the folksonomy and is used to structure search results. All findings are demonstrated on a large scale dataset. A long version of this paper has been published at the European Semantic Web Conference 2006.}, author = {Hotho, Andreas and Jäschke, Robert and Schmitz, Christoph and Stumme, Gerd}, booktitle = {Proc. 
FGIR 2006}, interhash = {3468dc3fed17eadf2e7c6ff06fbb34a3}, intrahash = {4d8b4f79814691fbe6db8357d63206a1}, title = {FolkRank: A Ranking Algorithm for Folksonomies}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2006/hotho2006folkrank.pdf}, year = 2006 } @proceedings{themenheft2007webmining, editor = {Hotho, Andreas and Stumme, Gerd}, interhash = {83c28b86f2ac897e906660e54e6fffc0}, intrahash = {c73311bb72ad480d74125dbc9d94c450}, journal = {Künstliche Intelligenz}, number = 3, pages = {5-8}, title = {Themenheft Web Mining, Künstliche Intelligenz}, url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758}, year = 2007 } @article{themenheft2007webmining, author = {Hotho, Andreas and Stumme, Gerd}, interhash = {39f94bf3a1663d9cec6a6cb8354a9bd9}, intrahash = {e9535ec82afa53f44a1b37704aa9a71f}, journal = {Künstliche Intelligenz}, number = 3, pages = {5-8}, title = {Mining the World Wide Web -- Methods, Ap- plications, and Perspectives}, url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758}, year = 2007 } @book{metzler2011featurecentric, asin = {3642228976}, author = {Metzler, Donald}, dewey = {005}, ean = {9783642228971}, edition = 2012, interhash = {4e473a9657c556434612d006a5a21460}, intrahash = {22e5fe8501844167b64a5aed595f4372}, isbn = {3642228976}, publisher = {Springer}, title = {A Feature-Centric View of Information Retrieval}, url = {http://www.amazon.com/Feature-Centric-View-Information-Retrieval/dp/3642228976}, year = 2011 } @inproceedings{zhou2005document, abstract = {The quality of document content, which is an issue that is usually ignored for the traditional ad hoc retrieval task, is a critical issue for Web search. Web pages have a huge variation in quality relative to, for example, newswire articles. To address this problem, we propose a document quality language model approach that is incorporated into the basic query likelihood retrieval model in the form of a prior probability. 
Our results demonstrate that, on average, the new model is significantly better than the baseline (query likelihood model) in terms of precision at the top ranks.}, acmid = {1099652}, address = {New York, NY, USA}, author = {Zhou, Yun and Croft, W. Bruce}, booktitle = {Proceedings of the 14th ACM International Conference on Information and Knowledge Management}, doi = {10.1145/1099554.1099652}, interhash = {01264e5f48959d326724b405d3898337}, intrahash = {d190feee02f804aea11f19979d3642b8}, isbn = {1-59593-140-6}, location = {Bremen, Germany}, numpages = {2}, pages = {331--332}, publisher = {ACM}, series = {CIKM '05}, title = {Document quality models for web ad hoc retrieval}, url = {http://doi.acm.org/10.1145/1099554.1099652}, year = 2005 } @article{broder2002taxonomy, author = {Broder, A.}, interhash = {1bfc1fd93c01979b73e05ae519a46bce}, intrahash = {36085c6aefab8fc5bc9903e2ecb96e00}, journal = {ACM SIGIR Forum}, number = 2, pages = {3-10}, title = {A taxonomy of Web search}, volume = 36, year = 2002 } @article{PeSt08, abstract = {Folksonomies in Wissensrepr{\"a}sentation und Information Retrieval. Die popul{\"a}ren Web 2.0-Dienste werden von Prosumern -- Produzenten und gleichsam Konsumenten -- nicht nur dazu genutzt, Inhalte zu produzieren, sondern auch, um sie inhaltlich zu erschlie{\ss}en. Folksonomies erlauben es dem Nutzer, Dokumente mit eigenen Schlagworten, sog. Tags, zu beschreiben, ohne dabei auf gewisse Regeln oder Vorgaben achten zu m{\"u}ssen. Neben einigen Vorteilen zeigen Folksonomies aber auch zahlreiche Schw{\"a}chen (u. a. einen Mangel an Pr{\"a}zision). Um diesen Nachteilen gr{\"o}{\ss}tenteils entgegenzuwirken, schlagen wir eine Interpretation der Tags als nat{\"u}rlichsprachige W{\"o}rter vor. Dadurch ist es uns m{\"o}glich, Methoden des Natural Language Processing (NLP) auf die Tags anzuwenden und so linguistische Probleme der Tags zu beseitigen. 
Dar{\"u}ber hinaus diskutieren wir Ans{\"a}tze und weitere Vorschl{\"a}ge (Tagverteilungen, Kollaboration und akteurspezifische Aspekte) hinsichtlich eines Relevance Rankings von getaggten Dokumenten. Neben Vorschl{\"a}gen auf {\"a}hnliche Dokumente ({\glqq}more like this!{\grqq}) erlauben Folksonomies auch Hinweise auf verwandte Nutzer und damit auf Communities ({\glqq}more like me!{\grqq}). Folksonomies in Knowledge Representation and Information Retrieval In Web 2.0 services {\grqq}prosumers” -- producers and consumers -- collaborate not only for the purpose of creating content, but to index these pieces of information as well. Folksonomies permit actors to describe documents with subject headings, {\grqq}tags{\grqq}, without regarding any rules. Apart from a lot of benefits folksonomies have many shortcomings (e.g., lack of precision). In order to solve some of the problems we propose interpreting tags as natural language terms. Accordingly, we can introduce methods of NLP to solve the tags’ linguistic problems. Additionally, we present criteria for tagged documents to create a ranking by relevance (tag distribution, collaboration and actor-based aspects). Besides recommending similar documents ({\glqq}more like this!{\grqq}) folksonomies can be used for the recommendation of similar users and communities ({\glqq}more like me!{\grqq}). 
}, author = {Peters, Isabella and Stock, Wolfgang G.}, interhash = {93b09c0700650150065232180fb23115}, intrahash = {3abe2759f6837cbd247021cb26bcf760}, issn = {1434-4653}, journal = {Information -- Wissenschaft und Praxis}, localfile = {Wissenschaftliche Bibliothek/dokumente/StPe08.pdf}, number = 2, pages = {77--90}, title = {{Folksonomies in Wissensrepr{\"a}sentation und Information Retrieval}}, url = {http://www.phil-fak.uni-duesseldorf.de/infowiss/admin/public_dateien/files/1/1204547968stock212_h.htm}, volume = {59 }, year = 2008 } @article{steenweg2004neuere, abstract = {Ausgehend von neueren Entwicklungen im Publikations- und Informationswesen, die einen nicht unwesentlichen Einfluss auf das zukünftige wissenschaftliche Publizieren haben werden, wird der künftige Zugriff auf Open-Access-Archive thematisiert. Um dringend notwendige Veränderungen herbeizuführen, bedarf es neben einem Mentalitätswandel bei den Autoren auch der Schaffung einer funktionierenden umfassenden Infrastruktur von Dokumentenservern mit Recherche und Archivierung, um die nötige Akzeptanz bei Autoren und Lesern zu erlangen. 
Eine Möglichkeit dazu böte eine OpenRep-Initiative, ein Netz von modular aufgebauten "Institutional Repositories".}, author = {Steenweg, Helge}, interhash = {8b414acafb896664d8fc05445eec6177}, intrahash = {8ffaeaf3cbc35354380da6a0a7c32aac}, journal = {ABI-Technik}, number = 4, pages = {282-293.}, title = {Neuere Entwicklungen im Informations- und Publikationswesen und ihre möglichen Auswirkungen}, url = {http://kobra.bibliothek.uni-kassel.de/bitstream/urn:nbn:de:hebis:34-200602086382/6/abi4-2004-04.pdf}, volume = 24, year = 2004 } @book{hearst2009search, asin = {0521113792}, author = {Hearst, Marti A.}, dewey = {006.7784}, ean = {9780521113793}, edition = 1, interhash = {a4903a26cab5e66d68595e31a7fa4b2f}, intrahash = {148875a34f84ca74f51bf0bd5f7a21e1}, isbn = {0521113792}, publisher = {Cambridge University Press}, title = {Search User Interfaces}, url = {http://searchuserinterfaces.com/book/}, year = 2009 } @incollection{BaezaYates2008From, abstract = {Semantic search seems to be an elusive and fuzzy target to IR, SW and NLP researchers. One reason is that this challenge lies in between all those fields, which implies a broad scope of issues and technologies that must be mastered. In this extended abstract we survey the work of Yahoo! Research at Barcelona to approach this problem. Our research is intended to produce a virtuous feedback circuit by using machine learning for capturing semantics, and, ultimately, for better search.}, author = {Baeza-Yates, Ricardo}, citeulike-article-id = {3094839}, doi = {http://dx.doi.org/10.1007/978-3-540-68234-9\_1}, interhash = {ca208582cd53cb7b5492c86e89be9514}, intrahash = {5f45b011336a39b8b157d7c06ddbc1ba}, journal = {The Semantic Web: Research and Applications}, pages = {1--2}, posted-at = {2008-08-07 10:32:09}, priority = {2}, title = {From Capturing Semantics to Semantic Search: A Virtuous Cycle}, url = {http://dx.doi.org/10.1007/978-3-540-68234-9\_1}, year = 2008 } @article{ponte1998lma, author = {Ponte, J.M. 
and Croft, W.B.}, booktitle = {Proceedings of the 21st annual international ACM SIGIR conference on Research and development in information retrieval}, interhash = {7d5d602886fa34e485cf6194f70bd793}, intrahash = {229b65aa2b99b2f27bc990840e79b3eb}, organization = {ACM New York, NY, USA}, pages = {275--281}, title = {{A language modeling approach to information retrieval}}, year = 1998 } @article{lavrenko2002rmt, author = {Lavrenko, V. and Allan, J. and DeGuzman, E. and LaFlamme, D. and Pollard, V. and Thomas, S.}, booktitle = {Proceedings of the second international conference on Human Language Technology Research}, interhash = {53d2b89bae4f297a5dc64fe73c3e8d05}, intrahash = {fa37fe4bfe3d3d447bab1681b1d6aef3}, organization = {Morgan Kaufmann Publishers Inc. San Francisco, CA, USA}, pages = {115--121}, title = {{Relevance models for topic detection and tracking}}, year = 2002 } @article{Robertson-Relevance-1976, abstract = {This paper examines statistical techniques for exploiting relevance information to weight search terms. These techniques are presented as a natural extension of weighting methods using information about the distribution of index terms in documents in general. A series of relevance weighting functions is derived and is justified by theoretical considerations. In particular, it is shown that specific weighted search methods are implied by a general probabilistic theory of retrieval. Different applications of relevance weighting are illustrated by experimental results for test collections.}, address = {School of Library, Archive and Information Studies University College London London WC1E 6BT, England; Computer Laboratory University of Cambridge Cambridge CB2 3QG, England}, author = {Robertson, S. E. 
and Jones, Sparck K.}, citeulike-article-id = {1839956}, doi = {10.1002/asi.4630270302}, interhash = {67e5814e51aa3fdddadc2e8274bcb03d}, intrahash = {0b1b36aa39c0f00d286a56054d56aee5}, journal = {Journal of the American Society for Information Science}, number = 3, pages = {129--146}, posted-at = {2007-10-30 11:03:54}, priority = {4}, title = {Relevance weighting of search terms}, url = {http://dx.doi.org/10.1002/asi.4630270302}, volume = 27, year = 1976 } @article{furnas1988iru, author = {Furnas, GW and Deerwester, S. and Dumais, ST and Landauer, TK and Harshman, RA and Streeter, LA and Lochbaum, KE}, booktitle = {Proceedings of the 11th annual international ACM SIGIR conference on Research and development in information retrieval}, interhash = {78be8a4a06c0453b1db414c157c00d60}, intrahash = {b7ee6989f3c0a0e7e98f4e81ff30a21f}, organization = {ACM New York, NY, USA}, pages = {465--480}, title = {{Information retrieval using a singular value decomposition model of latent semantic structure}}, year = 1988 } @article{deerwester1990ils, author = {Deerwester, S. and Dumais, S.T. and Furnas, G.W. and Landauer, T.K. 
and Harshman, R.}, interhash = {c15e0f019b2b967d224e7443100e8ff9}, intrahash = {c4b3c80072a4c342ac64c663401db5cb}, journal = {Journal of the American society for information science}, number = 6, pages = {391--407}, title = {{Indexing by latent semantic analysis}}, volume = 41, year = 1990 } @phdthesis{lavrenko2004gs, author = {Lavrenko, Victor}, interhash = {d9c9cc57ae903034fb83faf1d12d950c}, intrahash = {0f7f08c36944c888e134140ee3966434}, title = {A Generative Theory of Relevance}, year = 2004 } @book{lavrenko2008gtr, author = {Lavrenko, V.}, interhash = {c1158e284687eded75ad24c9c39d08d3}, intrahash = {f3812a599906985fd53a64bc60c6b766}, publisher = {Springer}, title = {{A generative theory of relevance}}, year = 2008 } @article{viet2009latent, abstract = {Relevance-based language models operate by estimating the probabilities of observing words in documents relevant (or pseudo relevant) to a topic. However, these models assume that if a document is relevant to a topic, then all tokens in the documentare relevant to that topic. This could limit model robustness and effectiveness. In this study, we propose a Latent Dirichletrelevance model, which relaxes this assumption. Our approach derives from current research on Latent Dirichlet Allocation(LDA) topic models. LDA has been extensively explored, especially for discovering a set of topics from a corpus. LDA itself,however, has a limitation that is also addressed in our work. Topics generated by LDA from a corpus are synthetic, i.e., theydo not necessarily correspond to topics identified by humans for the same corpus. In contrast, our model explicitly considersthe relevance relationships between documents and given topics (queries). Thus unlike standard LDA, our model is directlyapplicable to goals such as relevance feedback for query modification and text classification, where topics (classes and queries)are provided upfront. 
Thus although the focus of our paper is on improving relevance-based language models, in effect ourapproach bridges relevance-based language models and LDA addressing limitations of both.}, author = {Ha-Thuc, Viet and Srinivasan, Padmini}, interhash = {90e160e155f6c30cf8047231ac582469}, intrahash = {e33cdabe533c95b97ec11788f3d6d85e}, journal = {Information Retrieval Technology}, pages = {13--25}, title = {A Latent Dirichlet Framework for Relevance Modeling}, url = {http://dx.doi.org/10.1007/978-3-642-04769-5_2}, year = 2009 } @article{rocchio1971rfi, author = {Rocchio, J.J.}, interhash = {c18d843e34fe4f8bd1d2438227857225}, intrahash = {fc6b8550bc2e30ece2fdf484139148f5}, journal = {The SMART retrieval system: experiments in automatic document processing}, pages = {313--323}, title = {{Relevance feedback in information retrieval}}, year = 1971 } @article{keyhere, abstract = {The theory of concept (or Galois) lattices provides a simple and formal approach to conceptual clustering. In this paper we present GALOIS, a system that automates and applies this theory. The algorithm utilized by GALOIS to build a concept lattice is incremental and efficient, each update being done in time at most quadratic in the number of objects in the lattice. Also, the algorithm may incorporate background information into the lattice, and through clustering, extend the scope of the theory. The application we present is concerned with information retrieval via browsing, for which we argue that concept lattices may represent major support structures. We describe a prototype user interface for browsing through the concept lattice of a document-term relation, possibly enriched with a thesaurus of terms. An experimental evaluation of the system performed on a medium-sized bibliographic database shows good retrieval performance and a significant improvement after the introduction of background knowledge. 
},
  author = {Carpineto, Claudio and Romano, Giovanni},
  interhash = {719ac1badf95acafafbd1487d82ae175},
  intrahash = {a53905954aeef0a80ec7424f978bca14},
  journal = {Machine Learning},
  month = aug,
  number = 2,
  pages = {95--122},
  title = {A lattice conceptual clustering system and its application to browsing retrieval},
  url = {http://dx.doi.org/10.1007/BF00058654},
  volume = 24,
  year = 1996
}

@article{themenheft2007webmining,
  author = {Hotho, Andreas and Stumme, Gerd},
  interhash = {39f94bf3a1663d9cec6a6cb8354a9bd9},
  intrahash = {e9535ec82afa53f44a1b37704aa9a71f},
  journal = {Künstliche Intelligenz},
  number = 3,
  pages = {5--8},
  title = {Mining the {World Wide Web} -- Methods, Applications, and Perspectives},
  url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758},
  year = 2007
}

@inproceedings{hotho03wordnet,
  address = {Toronto},
  author = {Hotho, A. and Staab, S. and Stumme, G.},
  booktitle = {Proc. SIGIR Semantic Web Workshop},
  comment = {alpha},
  interhash = {c2a9a89ce20cef90a1e78d34dc2c2afe},
  intrahash = {04c7d86337d68e4ed9ae637029c43414},
  privnote = {alpha},
  title = {Wordnet improves text document clustering},
  url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003wordnet.pdf},
  year = 2003
}

@inproceedings{hotho2006folkrank,
  abstract = {In social bookmark tools users are setting up lightweight conceptual structures called folksonomies. Currently, the information retrieval support is limited. We present a formal model and a new search algorithm for folksonomies, called FolkRank, that exploits the structure of the folksonomy. The proposed algorithm is also applied to find communities within the folksonomy and is used to structure search results. All findings are demonstrated on a large scale dataset. A long version of this paper has been published at the European Semantic Web Conference 2006.},
  author = {Hotho, Andreas and Jäschke, Robert and Schmitz, Christoph and Stumme, Gerd},
  booktitle = {Proc. FGIR 2006},
  interhash = {3468dc3fed17eadf2e7c6ff06fbb34a3},
  intrahash = {4d8b4f79814691fbe6db8357d63206a1},
  title = {{FolkRank}: A Ranking Algorithm for Folksonomies},
  url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2006/hotho2006folkrank.pdf},
  year = 2006
}

@proceedings{themenheft2007webminingissue,
  editor = {Hotho, Andreas and Stumme, Gerd},
  interhash = {83c28b86f2ac897e906660e54e6fffc0},
  intrahash = {c73311bb72ad480d74125dbc9d94c450},
  journal = {Künstliche Intelligenz},
  number = 3,
  pages = {5--8},
  title = {Themenheft Web Mining, Künstliche Intelligenz},
  url = {http://www.kuenstliche-intelligenz.de/index.php?id=7758},
  year = 2007
}

@article{1347495,
  abstract = {In this paper, we propose a common phrase index as an efficient index structure to support phrase queries in a very large text database. Our structure is an extension of previous index structures for phrases and achieves better query efficiency with modest extra storage cost. Further improvement in efficiency can be attained by implementing our index according to our observation of the dynamic nature of common word set. In experimental evaluation, a common phrase index using 255 common words has an improvement of about 11% and 62% in query time for the overall and large queries (queries of long phrases) respectively over an auxiliary nextword index. Moreover, it has only about 19% extra storage cost. Compared with an inverted index, our improvement is about 72% and 87% for the overall and large queries respectively. We also propose to implement a common phrase index with dynamic update feature. Our experiments show that more improvement in time efficiency can be achieved.},
  address = {Tarrytown, NY, USA},
  author = {Chang, Matthew and Poon, Chung Keung},
  doi = {10.1016/j.ipm.2007.06.003},
  interhash = {e06f855c6e3bea229b8a3962b4ae09ce},
  intrahash = {173b15d1d17a8e122f49b1c640d46b15},
  issn = {0306-4573},
  journal = {Inf. Process.
Manage.},
  number = 2,
  pages = {756--769},
  publisher = {Pergamon Press, Inc.},
  title = {Efficient phrase querying with common phrase index},
  url = {http://portal.acm.org/citation.cfm?id=1347495},
  volume = 44,
  year = 2008
}

@article{jarvelin2002cumulated,
  abstract = {Modern large retrieval environments tend to overwhelm their users by their large output. Since all documents are not of equal relevance to their users, highly relevant documents should be identified and ranked first for presentation. In order to develop IR techniques in this direction, it is necessary to develop evaluation approaches and methods that credit IR methods for their ability to retrieve highly relevant documents. This can be done by extending traditional evaluation methods, that is, recall and precision based on binary relevance judgments, to graded relevance judgments. Alternatively, novel measures based on graded relevance judgments may be developed. This article proposes several novel measures that compute the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. The first one accumulates the relevance scores of retrieved documents along the ranked result list. The second one is similar but applies a discount factor to the relevance scores in order to devaluate late-retrieved documents. The third one computes the relative-to-the-ideal performance of IR techniques, based on the cumulative gain they are able to yield. These novel measures are defined and discussed and their use is demonstrated in a case study using TREC data: sample system run results for 20 queries in TREC-7. As a relevance base we used novel graded relevance judgments on a four-point scale. The test results indicate that the proposed measures credit IR methods for their ability to retrieve highly relevant documents and allow testing of statistical significance of effectiveness differences. The graphs based on the measures also provide insight into the performance IR techniques and allow interpretation, for example, from the user point of view.},
  address = {New York, NY, USA},
  author = {Järvelin, Kalervo and Kekäläinen, Jaana},
  doi = {10.1145/582415.582418},
  interhash = {c46348827790803e8e7465ffd1a13376},
  intrahash = {12176d90012ed75f57996af0b9240d02},
  issn = {1046-8188},
  journal = {ACM Transactions on Information Systems},
  month = oct,
  number = 4,
  pages = {422--446},
  publisher = {ACM},
  title = {Cumulated gain-based evaluation of {IR} techniques},
  url = {http://portal.acm.org/citation.cfm?id=582418},
  volume = 20,
  year = 2002
}

@inproceedings{jarvelin2000ir,
  abstract = {This paper proposes evaluation methods based on the use of non-dichotomous relevance judgements in IR experiments. It is argued that evaluation methods should credit IR methods for their ability to retrieve highly relevant documents. This is desirable from the user point of view in modern large IR environments. The proposed methods are (1) a novel application of P-R curves and average precision computations based on separate recall bases for documents of different degrees of relevance, and (2) two novel measures computing the cumulative gain the user obtains by examining the retrieval result up to a given ranked position. We then demonstrate the use of these evaluation methods in a case study on the effectiveness of query types, based on combinations of query structures and expansion, in retrieving documents of various degrees of relevance. The test was run with a best match retrieval system (In-Query1) in a text database consisting of newspaper articles. The results indicate that the tested strong query structures are most effective in retrieving highly relevant documents. The differences between the query types are practically essential and statistically significant. More generally, the novel evaluation methods and the case demonstrate that non-dichotomous relevance assessments are applicable in IR experiments, may reveal interesting phenomena, and allow harder testing of IR methods.},
  address = {New York, NY, USA},
  author = {Järvelin, Kalervo and Kekäläinen, Jaana},
  booktitle = {SIGIR '00: Proceedings of the 23rd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval},
  doi = {10.1145/345508.345545},
  interhash = {a62a44c48d24acc64cd6713f21111d72},
  intrahash = {12592d5f805db5bd127ee5abae1a4325},
  isbn = {1-58113-226-3},
  location = {Athens, Greece},
  pages = {41--48},
  publisher = {ACM},
  title = {{IR} evaluation methods for retrieving highly relevant documents},
  url = {http://portal.acm.org/citation.cfm?id=345545},
  year = 2000
}

@book{baeza-yates1999modern,
  abstract = {This is a rigorous and complete textbook for a first course on information retrieval from the computer science (as opposed to a user-centred) perspective. The advent of the Internet and the enormous increase in volume of electronically stored information generally has led to substantial work on IR from the computer science perspective - this book provides an up-to-date student oriented treatment of the subject.},
  address = {Boston, MA, USA},
  author = {Baeza-Yates, Ricardo A. and Ribeiro-Neto, Berthier},
  interhash = {6f78177742b3c836218aacfc7fc4c43c},
  intrahash = {ead0b4af17c94074fe1c774d2f267617},
  isbn = {020139829X},
  publisher = {Addison-Wesley Longman Publishing Co., Inc.},
  title = {Modern Information Retrieval},
  url = {http://portal.acm.org/citation.cfm?id=553876},
  year = 1999
}

@book{rijsbergen79information,
  address = {London},
  author = {van Rijsbergen, C.
J.},
  edition = 2,
  interhash = {0edccdac9af024f458911b82f61686ab},
  intrahash = {b53893655b48140d4310a848dbf204d3},
  publisher = {Butterworths},
  title = {Information retrieval},
  url = {http://www.dcs.gla.ac.uk/Keith/Preface.html},
  year = 1979
}

@book{books/mk/WittenMB99,
  author = {Witten, Ian H. and Moffat, Alistair and Bell, Timothy C.},
  interhash = {e27de62d31e9665025dc788cb30027d5},
  intrahash = {99a44801fa131f7fda77e769791f5f78},
  isbn = {1-55860-570-3},
  publisher = {Morgan Kaufmann},
  title = {Managing Gigabytes: Compressing and Indexing Documents and Images, Second Edition},
  url = {http://www.cs.mu.oz.au/mg/},
  year = 1999
}

@book{books/aw/Baeza-YatesR99,
  author = {Baeza-Yates, Ricardo A. and Ribeiro-Neto, Berthier A.},
  interhash = {6f78177742b3c836218aacfc7fc4c43c},
  intrahash = {16ab70975f635f8d72de82e2ef3ef9de},
  isbn = {0-201-39829-X},
  publisher = {ACM Press / Addison-Wesley},
  title = {Modern Information Retrieval},
  url = {http://www.ischool.berkeley.edu/~hearst/irbook/glossary.html},
  year = 1999
}

@book{ferber2003information,
  address = {Heidelberg},
  author = {Ferber, Reginald},
  interhash = {52c1b4ab3e818efef6635eb76b778608},
  intrahash = {b60dbc902a2e19877aec154fa5747751},
  publisher = {dpunkt Verlag},
  title = {Information Retrieval: Suchmodelle und Data-Mining-Verfahren für Textsammlungen und das Web},
  url = {http://information-retrieval.de/},
  year = 2003
}

@comment{A repeated definition of key manning2008 was dropped here: the key is already defined near the top of this file, and a second definition only triggers a repeated-entry error. The dropped copy carried this url: http://www-csli.stanford.edu/~hinrich/information-retrieval-book.html}

@inproceedings{1390424,
  address = {New York, NY, USA},
  author = {Schenkel, Ralf and Crecelius, Tom and Kacimi, Mouna and Michel, Sebastian and Neumann, Thomas and Parreira, Josiane X. and Weikum, Gerhard},
  booktitle = {SIGIR '08: Proceedings of the 31st annual international ACM SIGIR conference on Research and development in information retrieval},
  doi = {10.1145/1390334.1390424},
  interhash = {17f7d73a408830fa3a66504e49f7e481},
  intrahash = {1c57b638bb3bee94300d883259774493},
  isbn = {978-1-60558-164-4},
  location = {Singapore, Singapore},
  pages = {523--530},
  privnote = {Online communities have become popular for publishing and searching content, as well as for finding and connecting to other users. User-generated content includes, for example, personal blogs, bookmarks, and digital photos. These items can be annotated and rated by different users, and these social tags and derived user-specific scores can be leveraged for searching relevant content and discovering subjectively interesting items. Moreover, the relationships among users can also be taken into consideration for ranking search results, the intuition being that you trust the recommendations of your close friends more than those of your casual acquaintances. Queries for tag or keyword combinations that compute and rank the top-k results thus face a large variety of options that complicate the query processing and pose efficiency challenges. This paper addresses these issues by developing an incremental top-k algorithm with two-dimensional expansions: social expansion considers the strength of relations among users, and semantic expansion considers the relatedness of different tags. It presents a new algorithm, based on principles of threshold algorithms, by folding friends and related tags into the search space in an incremental on-demand manner. The excellent performance of the method is demonstrated by an experimental evaluation on three real-world datasets, crawled from deli.cio.us, Flickr, and LibraryThing.},
  publisher = {ACM},
  title = {Efficient top-k querying over social-tagging networks},
  url = {http://portal.acm.org/citation.cfm?id=1390424},
  year = 2008
}

@article{1377474,
  abstract = {Many Web sites have begun allowing users to submit items to a collection and tag them with keywords. The folksonomies built from these tags are an interesting topic that has seen little empirical research. This study compared the search information retrieval (IR) performance of folksonomies from social bookmarking Web sites against search engines and subject directories. Thirty-four participants created 103 queries for various information needs. Results from each IR system were collected and participants judged relevance. Folksonomy search results overlapped with those from the other systems, and documents found by both search engines and folksonomies were significantly more likely to be judged relevant than those returned by any single IR system type. The search engines in the study had the highest precision and recall, but the folksonomies fared surprisingly well. Del.icio.us was statistically indistinguishable from the directories in many cases. Overall the directories were more precise than the folksonomies but they had similar recall scores. Better query handling may enhance folksonomy IR performance further. The folksonomies studied were promising, and may be able to improve Web search performance.},
  address = {Tarrytown, NY, USA},
  author = {Morrison, P. Jason},
  doi = {10.1016/j.ipm.2007.12.010},
  interhash = {41f042c033417e4dbb9e48b76521363f},
  intrahash = {7e1dc3f52085093cc33d8fe931253b34},
  issn = {0306-4573},
  journal = {Inf. Process.
Manage.},
  number = 4,
  pages = {1562--1579},
  publisher = {Pergamon Press, Inc.},
  title = {Tagging and searching: Search retrieval effectiveness of folksonomies on the {World Wide Web}},
  url = {http://portal.acm.org/citation.cfm?id=1377474},
  volume = 44,
  year = 2008
}

@article{Smeulders00CBIR,
  address = {Washington, DC, USA},
  author = {Smeulders, Arnold W. M. and Worring, Marcel and Santini, Simone and Gupta, Amarnath and Jain, Ramesh},
  citeulike-article-id = {942093},
  doi = {10.1109/34.895972},
  interhash = {ead44ca34c9a120a17c6d1cb757d3b8d},
  intrahash = {ff99ff85fdc2224d826dab75df21cf0d},
  issn = {0162-8828},
  journal = {IEEE Trans. Pattern Anal. Mach. Intell.},
  month = dec,
  number = 12,
  pages = {1349--1380},
  posted-at = {2008-04-13 17:14:20},
  priority = {2},
  publisher = {IEEE Computer Society},
  title = {Content-Based Image Retrieval at the End of the Early Years},
  url = {http://portal.acm.org/citation.cfm?id=357873},
  volume = 22,
  year = 2000
}

@article{datta2008,
  abstract = {We have witnessed great interest and a wealth of promise in content-based image retrieval as an emerging technology. While the last decade laid foundation to such promise, it also paved the way for a large number of new techniques and systems, got many new people involved, and triggered stronger association of weakly related fields. In this paper, we survey almost 300 key theoretical and empirical contributions in the current decade related to image retrieval and automatic image annotation, and discuss the spawning of related sub-fields in the process. We also discuss significant challenges involved in the adaptation of existing image retrieval techniques to build systems that can be useful in the real-world. In retrospect of what has been achieved so far, we also conjecture what the future may hold for image retrieval research.},
  author = {Datta, Ritendra and Joshi, Dhiraj and Li, Jia and Wang, James Z.},
  interhash = {6f908ace6a3d5135960dc663d6335922},
  intrahash = {278a48194bc9afbd298c36dd497a9821},
  journal = {ACM Computing Surveys},
  number = 2,
  title = {Image Retrieval: Ideas, Influences, and Trends of the New Age},
  url = {http://infolab.stanford.edu/~wangz/project/imsearch/review/JOUR/},
  volume = 40,
  year = 2008
}

@inproceedings{ducrou2006FCA,
  author = {Ducrou, J. and Vormbrock, B. and Eklund, P. W.},
  booktitle = {Proc. of the 14th Int. Conference on Conceptual Structures},
  interhash = {945317e6fad27eff08082ef38cc01dff},
  intrahash = {be881638f77ac9d0b4e8434f0b3385f6},
  publisher = {Springer-Verlag},
  series = {LNAI},
  title = {{FCA}-based Browsing and Searching of a Collection of Images},
  year = 2006
}

@article{hotho2007mining,
  author = {Hotho, Andreas and Stumme, Gerd},
  interhash = {26915a205b66368931821165ecaf972c},
  intrahash = {92d3a5fdd786086fa12787e3e350b6af},
  journal = {Künstliche Intelligenz},
  number = 3,
  pages = {5--8},
  title = {Mining the {World Wide Web}},
  url = {http://kobra.bibliothek.uni-kassel.de/bitstream/urn:nbn:de:hebis:34-2008021320337/3/HothoStummeMiningWWW.pdf},
  vgwort = {20},
  year = 2007
}

@inproceedings{conf/www/SarwarKKR01,
  author = {Sarwar, Badrul M. and Karypis, George and Konstan, Joseph A. and Riedl, John},
  booktitle = {WWW},
  ee = {http://doi.acm.org/10.1145/371920.372071},
  interhash = {043d1aaba0f0b8c01d84edd517abedaf},
  intrahash = {f349b429624935212ebeed613b89794f},
  pages = {285--295},
  title = {Item-based collaborative filtering recommendation algorithms},
  url = {http://www10.org/cdrom/papers/pdf/p519.pdf},
  year = 2001
}

@article{974906,
  abstract = {Users of online search engines often find it difficult to express their need for information in the form of a query.
However, if the user can identify examples of the kind of documents they require then they can employ a technique known as relevance feedback. Relevance feedback covers a range of techniques intended to improve a user's query and facilitate retrieval of information relevant to a user's information need. In this paper we survey relevance feedback techniques. We study both automatic techniques, in which the system modifies the user's query, and interactive techniques, in which the user has control over query modification. We also consider specific interfaces to relevance feedback systems and characteristics of searchers that can affect the use and success of relevance feedback systems.},
  address = {New York, NY, USA},
  author = {Ruthven, Ian and Lalmas, Mounia},
  doi = {10.1017/S0269888903000638},
  interhash = {965654547515dda5b340f80c41718ca4},
  intrahash = {0357d6b4d3aa885a0978036d50136373},
  issn = {0269-8889},
  journal = {Knowl. Eng. Rev.},
  number = 2,
  pages = {95--145},
  publisher = {Cambridge University Press},
  title = {A survey on the use of relevance feedback for information access systems},
  url = {http://personal.cis.strath.ac.uk/~ir/papers/ker.pdf},
  volume = 18,
  year = 2003
}

@article{wiley1991,
  address = {Department of Information Studies, University of Sheffield, Western Bank, Sheffield S10 2TN, United Kingdom},
  author = {Peat, Helen J. and Willett, Peter},
  doi = {10.1002/(SICI)1097-4571(199106)42:5<378::AID-ASI8>3.0.CO;2-8},
  interhash = {b3cbc3b9fe13100408c5ef4c2c175b01},
  intrahash = {96cc916a85ed034ea7fa72e4259b9f05},
  journal = {Journal of the American Society for Information Science},
  number = 5,
  pages = {378--383},
  publisher = {John Wiley \& Sons, Inc.},
  title = {The limitations of term co-occurrence data for query expansion in document retrieval systems},
  url = {http://www.iro.umontreal.ca/~nie/IFT6255/Peat_Willett_QExp.pdf},
  volume = 42,
  year = 1991
}

@inproceedings{hotho2006information,
  address = {Budva, Montenegro},
  author = {Hotho, Andreas and Jäschke, Robert and Schmitz, Christoph and Stumme, Gerd},
  booktitle = {Proceedings of the 3rd European Semantic Web Conference},
  interhash = {10ec64d80b0ac085328a953bb494fb89},
  intrahash = {7da1127fc4836e2cf58e3073f1b888b2},
  isbn = {3-540-34544-2},
  month = jun,
  pages = {411--426},
  publisher = {Springer},
  series = {LNCS},
  title = {Information Retrieval in Folksonomies: Search and Ranking},
  url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2006/seach2006hotho_eswc.pdf},
  vgwort = {29},
  volume = 4011,
  year = 2006
}

@article{bloehdorn2006intro,
  author = {Bloehdorn, S. and Buntine, W. and Hotho, A.},
  editor = {Bloehdorn, S. and Buntine, W.
and Hotho, A.},
  institution = {An International Journal of Computing and Informatics},
  interhash = {4a3316f2f66fdfcb45c89bb872a82400},
  intrahash = {e434232b8e3b80ff3b95006432fe54ee},
  issn = {0350-5596},
  journal = {Informatica},
  number = 2,
  pages = {141},
  title = {Introduction to the Special Issue 'Learning in Web Search'},
  url = {http://www.informatica.si/PDF/30-2/00_Introduction.pdf},
  volume = 30,
  year = 2006
}

@proceedings{2005-lws-proceedings,
  editor = {Bloehdorn, Stephan and Buntine, Wray and Hotho, Andreas},
  interhash = {940bd83a2753b14159690fde387e3909},
  intrahash = {2de98c2b635f36c137e25256e8c235e0},
  month = aug,
  note = {Workshop at the 22nd International Conference on Machine Learning (ICML 2005)},
  title = {Proceedings of the Workshop on Learning in Web Search ({LWS} 2005)},
  url = {http://cosco.hiit.fi/search/learninginsearch05/ICML_W4.pdf},
  year = 2005
}

@article{984322,
  abstract = {Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and to then rank documents by the likelihood of the query according to the estimated language model. A central issue in language model estimation is smoothing, the problem of adjusting the maximum likelihood estimator to compensate for data sparseness. In this article, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collections. Experimental results show that not only is the retrieval performance generally sensitive to the smoothing parameters, but also the sensitivity pattern is affected by the query type, with performance being more sensitive to smoothing for verbose queries than for keyword queries. Verbose queries also generally require more aggressive smoothing to achieve optimal performance. This suggests that smoothing plays two different role---to make the estimated document language model more accurate and to "explain" the noninformative words in the query. In order to decouple these two distinct roles of smoothing, we propose a two-stage smoothing strategy, which yields better sensitivity patterns and facilitates the setting of smoothing parameters automatically. We further propose methods for estimating the smoothing parameters automatically. Evaluation on five different databases and four types of queries indicates that the two-stage smoothing method with the proposed parameter estimation methods consistently gives retrieval performance that is close to---or better than---the best results achieved using a single smoothing method and exhaustive parameter search on the test data.},
  address = {New York, NY, USA},
  author = {Zhai, Chengxiang and Lafferty, John},
  doi = {10.1145/984321.984322},
  interhash = {4d0acc84788713f07adbe0df3adc92d8},
  intrahash = {c7aff853599cdde58a1d27eff4ede314},
  issn = {1046-8188},
  journal = {ACM Trans. Inf. Syst.},
  number = 2,
  pages = {179--214},
  publisher = {ACM Press},
  title = {A study of smoothing methods for language models applied to information retrieval},
  url = {http://portal.acm.org/citation.cfm?id=984322},
  volume = 22,
  year = 2004
}

@comment{A repeated definition of key ducrou2006FCA was dropped here: the key is already defined earlier in this file with the same title, booktitle, publisher and year, and the dropped copy listed the three authors under editor, leaving the entry without an author field.}

@inproceedings{salton1988spreading,
  abstract = {Spreading activation methods have been recommended in information retrieval to expand the search vocabulary and to complement the retrieved document sets. The spreading activation strategy is reminiscent of earlier associative indexing and retrieval systems. Some spreading activation procedures are briefly described, and evaluation output is given, reflecting the effectiveness of one of the proposed procedures.},
  address = {New York, NY, USA},
  author = {Salton, G. and Buckley, C.},
  booktitle = {SIGIR '88: Proceedings of the 11th annual international ACM SIGIR conference on Research and development in information retrieval},
  doi = {10.1145/62437.62447},
  interhash = {7fdb31627e1a45ce109c7245fb6462b9},
  intrahash = {994aef0486e69095ee0d8ba5b3e3a91c},
  isbn = {2-7061-0309-4},
  location = {Grenoble, France},
  pages = {147--160},
  privnote = {- ir is better than spreading activation - formal evaluation is provided - better modell is needed - no relation to folksonomies},
  publisher = {ACM Press},
  title = {On the use of spreading activation methods in automatic information retrieval},
  url = {http://portal.acm.org/citation.cfm?id=62447&dl=ACM&coll=GUIDE},
  year = 1988
}

@article{crestani1997spreading,
  abstract = {This paper surveys the use of Spreading Activation techniques on Semantic Networks in Associative Information Retrieval. The major Spreading Activation models are presented and their applications to IR is surveyed. A number of works in this area are critically analyzed in order to study the relevance of Spreading Activation for associative IR.
},
  author = {Crestani, F.},
  interhash = {3dfe398bb588335ffc562088d5a509de},
  intrahash = {c26c16e0a8036000b788fada656f59dd},
  journal = {Artificial Intelligence Review},
  month = dec,
  number = 6,
  pages = {453--482},
  privnote = {- semantic network is very similar to the semantic web view - nice overview of old IR literatur in this area - the data problem is very promenent for the needed background knowledge - },
  title = {Application of Spreading Activation Techniques in Information Retrieval},
  url = {http://dx.doi.org/10.1023/A:1006569829653},
  volume = 11,
  year = 1997
}

@article{eigenvector2005Langville,
  abstract = {Web information retrieval is significantly more challenging than traditional well-controlled, small document collection information retrieval. One main difference between traditional information retrieval and Web information retrieval is the Web’s hyperlink structure. This structure has been exploited by several of today’s leading Web search engines, particularly Google and Teoma. In this survey paper, we focus on Web information retrieval methods that use eigenvector computations, presenting the three popular methods of HITS, PageRank, and SALSA.},
  author = {Langville, Amy N. and Meyer, Carl D.},
  interhash = {d457071e1f5270c3d50cbb3243546833},
  intrahash = {445172dea700200486177842e9dfe3cb},
  journal = {The SIAM Review},
  number = 1,
  pages = {135--161},
  title = {A Survey of Eigenvector Methods of Web Information Retrieval},
  url = {http://www.cofc.edu/~langvillea/surveyEVwebIRReprint.pdf},
  volume = 47,
  year = 2005
}

@inproceedings{lang95newsweeder,
  author = {Lang, Ken},
  booktitle = {Proceedings of the 12th International Conference on Machine Learning},
  interhash = {e64ed50bf2d9ceb44e38ec59c0947207},
  intrahash = {b738abb5a0f2cae47e8f0633460c69a3},
  pages = {331--339},
  publisher = {Morgan Kaufmann publishers Inc.: San Mateo, CA, USA},
  title = {{NewsWeeder}: learning to filter netnews},
  url = {http://citeseer.ist.psu.edu/lang95newsweeder.html},
  year = 1995
}

@comment{A repeated definition of key conf/www/SarwarKKR01 was dropped here: the key is already defined earlier in this file. The dropped copy differed only in its url: http://dblp.uni-trier.de/db/conf/www/www2001.html#SarwarKKR01}

@book{sparck-jones+willett97,
  editor = {Sparck-Jones, K. and Willett, P.},
  interhash = {9316414517263caed642f5f72e864033},
  intrahash = {887f48afcb0104e36e7d755b204cc659},
  key = {sparck-jones+willett97},
  label = {Readings in Information Retrieval},
  publisher = {Morgan Kaufmann},
  title = {Readings in Information Retrieval},
  type = {Book},
  year = 1997
}

@comment{Byte-identical repeated definitions were dropped from this part of the file for the keys rijsbergen79information, ferber2003information, books/aw/Baeza-YatesR99, books/mk/WittenMB99, conf/sigir/Voorhees94, sparck-jones+willett97 and lang95newsweeder. Each key keeps its first definition; a repeated key only produces a repeated-entry error.}

@inproceedings{conf/sigir/Voorhees94,
  author = {Voorhees, Ellen M.},
  booktitle = {SIGIR},
  cdrom = {SIGIR1994/P061.pdf},
  ee = {db/conf/sigir/Voorhees94.html},
  interhash = {4b351067b8aaf4953722cc6d5b494e46},
  intrahash = {28bfb2f51bb8cb7bfb3473f051938fb0},
  pages = {61--69},
  title = {Query Expansion Using Lexical-Semantic Relations},
  url = {http://dblp.uni-trier.de/db/conf/sigir/sigir94.html#Voorhees94},
  year = 1994
}

@inproceedings{1099726,
  address = {New York, NY, USA},
  author = {Fonseca, Bruno M. and Golgher, Paulo and Pôssas, Bruno and Ribeiro-Neto, Berthier and Ziviani, Nivio},
  booktitle = {CIKM '05: Proceedings of the 14th ACM international conference on Information and knowledge management},
  doi = {10.1145/1099554.1099726},
  interhash = {409a23c3d8b38cac0f1a7149b1507b0c},
  intrahash = {8609bcd383c3f14bf4aac32dae8e970b},
  isbn = {1-59593-140-6},
  location = {Bremen, Germany},
  pages = {696--703},
  publisher = {ACM Press},
  title = {Concept-based interactive query expansion},
  year = 2005
}

@article{richter2005metadata,
  abstract = {During the last decade, the advance of machine-learning tools and algorithms has resulted in tremendous progress in the automated classification of documents. However, many classifiers base their classification decisions solely on document text and ignore metadata (such as authors, publication date, and author affiliation). In this project, automated classifiers using the k-Nearest Neighbour algorithm were developed for the classification of patents into two different classification systems. Those using metadata (in this case inventor names, applicant names and International Patent Classification codes) were compared with those ignoring it. The use of metadata could significantly improve the classification of patents with one classification system, improving classification accuracy from 70.8% up to 75.4%, which was highly statistically significant. However, the results for the other classification system were inconclusive: while metadata could improve the quality of the classifier for some experiments (recall increased from 66.0% to 68.9%, which was a small but nonetheless significant improvement), experiments with different parameters showed that it could also lead to a deterioration of quality (recall dropping as low as 61.0%). The study shows that metadata can play an extremely useful role in the classification of patents. Nonetheless, it must not be used indiscriminately but only after careful evaluation of its usefulness.},
  author = {Richter and MacFarlane},
  interhash = {c7749092c6e5a90cd43fe022fa398e0b},
  intrahash = {d15595d5279e762207d67f2a9b688c37},
  journal = {World Patent Information},
  pages = {12--26},
  title = {The impact of metadata on the accuracy of automated patent classification},
  url = {http://dx.doi.org/10.1016/j.wpi.2004.08.001},
  volume = 27,
  year = 2005
}