@inproceedings{conf/wsdm/KohlschutterFN10,
  author = {Kohlsch{\"u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
  booktitle = {Proc. of 3rd {ACM} International Conference on Web Search and Data Mining ({WSDM} 2010), New York City, NY, USA},
  interhash = {25ea118166ef2f0d5597ca90fa702c9d},
  intrahash = {dbc8464d9a298afa49d607d65f2160e2},
  title = {Boilerplate Detection using Shallow Text Features},
  year = 2010
}

@inproceedings{Teufel01task-basedevaluation,
  abstract = {We present a novel method for task-based evaluation of summaries of scientific articles. The task we propose is a question-answering task, where the questions are about the relatedness of the current paper to prior research. This evaluation method is time-efficient with respect to material preparation and data collection, so that it is possible to test against many different baselines, something that is not usually feasible in evaluations by relevance decision. We use this methodology to evaluate the quality of summaries our system produces. These summaries are designed to describe the contribution of a scientific article in relation to other work. The results show that this type of summary is indeed more useful than the baselines (random sentences, keyword lists and generic author-written summaries), and nearly as useful as the full texts.},
  author = {Teufel, Simone},
  booktitle = {Workshop Automatic Summarization, {NAACL}},
  interhash = {ed0c6de01aa6b0a3ef369627eb689cf3},
  intrahash = {b95470c8eae5d8f0372d20215c35f236},
  pages = {12--21},
  title = {Task-Based Evaluation of Summary Quality: Describing Relationships between Scientific Papers},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.12.8139},
  year = 2001
}

@inproceedings{Kando99textstructure,
  abstract = {This paper describes an information retrieval system with the function to support user's use of the retrieved documents using the text-level structure of documents.
The text-level structure of each document is described by the occurrence of typical functional components in the text. Automatic detection of the components has been attempted in previous works using surface-level language processing. The proposed system firstly utilizes the text structure to conduct high-precision searches of documents or passages by distinguishing the role or function each concept plays in the text. It also allows browsing or skimming of retrieved texts, creating summaries on-the-fly with various levels of condensation specified by the user. Moreover, the system can search and display any unit of a text such as a sentence, a paragraph or a chapter. Comparison of relevant passages in retrieved documents across multiple texts is helpful for users to examine, analyze, compare and integrate texts and...},
  author = {Kando, Noriko},
  booktitle = {Proceedings of the 4th International Workshop on Information Retrieval with Asian Languages},
  interhash = {7cd913f37b21579ee636dc5036cca292},
  intrahash = {94d62d9f3ef51731ea1edc03662616d9},
  pages = {126--135},
  title = {Text Structure Analysis as a Tool to Make Retrieved Documents Usable},
  url = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.3165},
  year = 1999
}

@article{cimiano05learning,
  author = {Cimiano, Philipp and Hotho, Andreas and Staab, Steffen},
  ee = {http://www.jair.org/papers/paper1648.html},
  interhash = {4c09568cff62babd362aab03095f4589},
  intrahash = {eaaf0e4b3a8b29fab23b6c15ce2d308d},
  journal = {Journal of Artificial Intelligence Research},
  pages = {305--339},
  title = {Learning Concept Hierarchies from Text Corpora using Formal Concept Analysis},
  url = {http://dblp.uni-trier.de/db/journals/jair/jair24.html#CimianoHS05},
  volume = 24,
  year = 2005
}

% NOTE(review): the url field of the following entry is a Google Scholar
% citation-export link (with a session signature), not a link to the article
% itself -- consider replacing it with the publisher DOI once confirmed.
@article{mann1988rhetorical,
  author = {Mann, William C. and Thompson, Sandra A.},
  interhash = {e8feceeba43734d376da50554b8071d2},
  intrahash = {8cedc5c82592cf1bbcfe5fa685fc5c67},
  journal = {Text},
  number = 3,
  pages = {243--281},
  title =
{Rhetorical structure theory: Toward a functional theory of text organization},
  url = {http://scholar.google.com/scholar.bib?q=info:BEw8CIWbucoJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAU3X_1Dq4ULnWfFzMeRsqGJcha1fReMSl&scisf=4&hl=en},
  volume = 8,
  year = 1988
}

@article{Salton1996127,
  author = {Salton, Gerard and Allan, James and Singhal, Amit},
  doi = {10.1016/S0306-4573(96)85001-1},
  interhash = {2fc2920e08a6a7dda2c256e62fc1e349},
  intrahash = {8674111d30a3c67d5d8a8b847cebb771},
  issn = {0306-4573},
  journal = {Information Processing \& Management},
  number = 2,
  pages = {127--138},
  title = {Automatic text decomposition and structuring},
  url = {http://www.sciencedirect.com/science/article/pii/S0306457396850011},
  volume = 32,
  year = 1996
}

% The doi fields of the next two entries are copied from their own "ee" links,
% which standard styles ignore.
@inproceedings{conf/birthday/BloehdornBCGHLMMSSV11,
  author = {Bloehdorn, Stephan and Blohm, Sebastian and Cimiano, Philipp and Giesbrecht, Eugenie and Hotho, Andreas and L{\"o}sch, Uta and M{\"a}dche, Alexander and M{\"o}nch, Eddie and Sorg, Philipp and Staab, Steffen and V{\"o}lker, Johanna},
  booktitle = {Foundations for the Web of Information and Services},
  crossref = {conf/birthday/2011studer},
  doi = {10.1007/978-3-642-19797-0_7},
  editor = {Fensel, Dieter},
  ee = {http://dx.doi.org/10.1007/978-3-642-19797-0_7},
  interhash = {db48314326a36fc4ac8770cba2c20e49},
  intrahash = {21be5153a8f491c9f209d57ce7662387},
  isbn = {978-3-642-19796-3},
  pages = {115--142},
  publisher = {Springer},
  title = {Combining Data-Driven and Semantic Approaches for Text Mining},
  url = {http://dblp.uni-trier.de/db/conf/birthday/studer2011.html#BloehdornBCGHLMMSSV11},
  year = 2011
}

@inproceedings{conf/mldm/ToivonenVVBV01,
  author = {Toivonen, Jarmo and Visa, Ari and Vesanen, Tomi and Back, Barbro and Vanharanta, Hannu},
  booktitle = {MLDM},
  crossref = {conf/mldm/2001},
  doi = {10.1007/3-540-44596-X_15},
  editor = {Perner, Petra},
  ee = {http://dx.doi.org/10.1007/3-540-44596-X_15},
  interhash = {2121b03b46ecdde012bae15ca8cf8ce6},
  intrahash = {2f23db9219b4d693acf15d7401684499},
  isbn = {3-540-42359-1},
  pages = {184--195},
publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Validation of Text Clustering Based on Document Contents},
  url = {http://dblp.uni-trier.de/db/conf/mldm/mldm2001.html#ToivonenVVBV01},
  volume = 2123,
  year = 2001
}

% The doi field below is copied from the entry's own "ee" link.
@inproceedings{conf/icdm/YassineH10,
  author = {Yassine, Mohamed and Hajj, Hazem},
  booktitle = {ICDM Workshops},
  crossref = {conf/icdm/2010w},
  doi = {10.1109/ICDMW.2010.75},
  editor = {Fan, Wei and Hsu, Wynne and Webb, Geoffrey I. and Liu, Bing and Zhang, Chengqi and Gunopulos, Dimitrios and Wu, Xindong},
  ee = {http://dx.doi.org/10.1109/ICDMW.2010.75},
  interhash = {72ae8c258d6559e4a90370453ecc2acc},
  intrahash = {8b0afeee143cec94f3058c214ae38c6f},
  pages = {1136--1142},
  publisher = {IEEE Computer Society},
  title = {A Framework for Emotion Mining from Text in Online Social Networks},
  url = {http://dblp.uni-trier.de/db/conf/icdm/icdmw2010.html#YassineH10},
  year = 2010
}

@misc{Rubin2011,
  abstract = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels.
The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  archiveprefix = {arXiv},
  author = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  eprint = {1107.2462},
  interhash = {e09d5d8587756d460a5d834025e75aac},
  intrahash = {f8a5a3958ae264d19c7f5415eb7f0bce},
  note = {arXiv:1107.2462},
  title = {Statistical Topic Models for Multi-Label Document Classification},
  url = {http://arxiv.org/abs/1107.2462},
  year = 2011
}

% NOTE(review): the first author's given name was "Asho" in the exported record
% and has been corrected to "Ashok" -- confirm against the title page. The
% empty page range ("--") was dropped and only the ISBN-13 is kept.
@book{srivastava2009mining,
  abstract = {Giving a broad perspective of the field from numerous vantage points, 'Text Mining' focuses on statistical methods for text mining and analysis. It examines methods to automatically cluster and classify text documents and applies these methods in a variety of areas.},
  address = {Boca Raton, FL},
  author = {Srivastava, Ashok and Sahami, Mehran},
  interhash = {290eabe518274b6fbcc73a106a7d52a6},
  intrahash = {45ab79501c114299142864becfa6c841},
  isbn = {9781420059403},
  publisher = {CRC Press},
  refid = {144226505},
  title = {Text Mining: Classification, Clustering, and Applications},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9781420059403},
  year = 2009
}

@article{march06crane,
  author = {Crane, Gregory},
  doi = {10.1045/march2006-crane},
  interhash = {36d4825e3189d89195693d1449e9aaea},
  intrahash = {eea7ae2ac1480c84f87544f2942c28f2},
  issn = {1082-9873},
  journal = {D-Lib Magazine},
  month = mar,
  number = 3,
  title = {What Do You Do with a Million Books?},
  url = {http://www.dlib.org/dlib/march06/crane/03crane.html},
  volume = 12,
  year = 2006
}

@article{Luo20091271,
  abstract = {Clustering is a very powerful data mining technique for topic discovery from text documents. The partitional clustering algorithms, such as the family of k-means, are reported performing well on document clustering. They treat the clustering problem as an optimization process of grouping documents into k clusters so that a particular criterion function is minimized or maximized. Usually, the cosine function is used to measure the similarity between two documents in the criterion function, but it may not work well when the clusters are not well separated. To solve this problem, we applied the concepts of neighbors and link, introduced in [S. Guha, R. Rastogi, K. Shim, ROCK: a robust clustering algorithm for categorical attributes, Information Systems 25 (5) (2000) 345-366], to document clustering. If two documents are similar enough, they are considered as neighbors of each other. And the link between two documents represents the number of their common neighbors. Instead of just considering the pairwise similarity, the neighbors and link involve the global information into the measurement of the closeness of two documents. In this paper, we propose to use the neighbors and link for the family of k-means algorithms in three aspects: a new method to select initial cluster centroids based on the ranks of candidate documents; a new similarity measure which uses a combination of the cosine and link functions; and a new heuristic function for selecting a cluster to split based on the neighbors of the cluster centroids. Our experimental results on real-life data sets demonstrated that our proposed methods can significantly improve the performance of document clustering in terms of accuracy without increasing the execution time much.},
  author = {Luo, Congnan and Li, Yanjun and Chung, Soon M.},
  doi = {10.1016/j.datak.2009.06.007},
  interhash = {bf59c4cf26cbc35d6142630b34a66d37},
  intrahash = {13483e90d8b46ef9435ec71473aacee4},
  issn = {0169-023X},
  journal = {Data \& Knowledge Engineering},
  note = {Including Special Section: Conference on Privacy in Statistical Databases (PSD 2008) - Six selected and extended papers on Database Privacy},
  number = 11,
  pages = {1271--1288},
  title = {Text document clustering based on neighbors},
  url = {http://www.sciencedirect.com/science/article/B6TYX-4WNB4Y8-1/2/1dcd00d9c049988da53b44a526dd6555},
  volume = 68,
  year = 2009
}

% NOTE(review): the citation key says 2006 but the year field is 2007 --
% confirm which is intended.
@book{feldman2006mining,
  asin = {0521836573},
  author = {Feldman, Ronen and Sanger, James},
  dewey = {005.74},
  ean = {9780521836579},
  interhash = {14cb9e63c6dca7830675c73578dcc30b},
  intrahash = {a310b253098a92c9f6352f568c1a9c37},
  isbn = {0521836573},
  publisher = {Cambridge University Press},
  title = {The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data},
  url = {http://www.amazon.com/Text-Mining-Handbook-Approaches-Unstructured/dp/0521836573/ref=sr_1_1?s=books&ie=UTF8&qid=1295265273&sr=1-1},
  year = 2007
}

@inproceedings{conf/sigir/HuFCZLYC08,
  author = {Hu, Jian and Fang, Lujun and Cao, Yang and Zeng, Hua-Jun and Li, Hua and Yang, Qiang and Chen, Zheng},
  booktitle = {SIGIR},
  crossref = {conf/sigir/2008},
  date = {2008-07-27},
  editor = {Myaeng, Sung-Hyon and Oard, Douglas W.
and Sebastiani, Fabrizio and Chua, Tat-Seng and Leong, Mun-Kew},
  doi = {10.1145/1390334.1390367},
  ee = {http://doi.acm.org/10.1145/1390334.1390367},
  interhash = {0a2878165034dcdfacb9045608ec482a},
  intrahash = {76f863a12c0b983ec67682deaec1ada4},
  isbn = {978-1-60558-164-4},
  pages = {179--186},
  publisher = {ACM},
  title = {Enhancing text clustering by leveraging {Wikipedia} semantics},
  url = {http://dblp.uni-trier.de/db/conf/sigir/sigir2008.html#HuFCZLYC08},
  year = 2008
}

% The exported record stored the authors' affiliations in "address" and a
% copyright notice in "publisher". The affiliations are kept in the
% non-standard "affiliation" field, which styles ignore.
@article{wiley2008,
  affiliation = {Department of Information Systems Engineering, Ben-Gurion University of the Negev, Beer-Sheva 84105, Israel; Department of Computer Science and Engineering, University of South Florida, Tampa, FL 33620},
  author = {Markov, A. and Last, M. and Kandel, A.},
  doi = {10.1002/int.20290},
  interhash = {f56f7f1f800242cbbc9596d4f1ab889f},
  intrahash = {512b4c9686e5d10763dc7258a10857b4},
  journal = {International Journal of Intelligent Systems},
  number = 6,
  pages = {654--679},
  publisher = {Wiley Periodicals, Inc.},
  title = {The hybrid representation model for web document classification},
  url = {http://dx.doi.org/10.1002/int.20290},
  volume = 23,
  year = 2008
}

% pages holds the article number; it is braced so the leading zero survives
% tools that parse bare values as integers.
@article{carpena:035102,
  author = {Carpena, P. and Bernaola-Galv{\'a}n, P. and Hackenberg, M. and Coronado, A. V. and Oliver, J. L.},
  doi = {10.1103/PhysRevE.79.035102},
  eid = {035102},
  interhash = {3444159872c65ea89d007d1838686acc},
  intrahash = {34dcb1eee3ffa31ff4eb77087343c146},
  journal = {Physical Review E (Statistical, Nonlinear, and Soft Matter Physics)},
  number = 3,
  numpages = {4},
  pages = {035102},
  publisher = {APS},
  title = {Level statistics of words: Finding keywords in literary texts and symbolic sequences},
  url = {http://bioinfo2.ugr.es/TextKeywords/},
  volume = 79,
  year = 2009
}

@book{UBMA_280507895,
  address = {Herdecke ; Bochum},
  author = {Heyer, Gerhard and Quasthoff, Uwe and Wittig, Thomas},
  edition = {1. korr.
Nachdr.},
  interhash = {d6fa152f7becd0a9d5155f748c29ac22},
  intrahash = {692999b8760981d3b2e0b9103b9d3b0f},
  isbn = {978-3-937137-30-8},
  pages = {XII, 348 S.},
  publisher = {W3L-Verl.},
  series = {IT lernen},
  title = {Text Mining: Wissensrohstoff Text},
  url = {http://aleph.bib.uni-mannheim.de/F/?func=find-b&request=280507895&find_code=020&adjacent=N&local_base=MAN01PUBLIC&x=0&y=0},
  year = 2008
}

@incollection{bloehdorn2006learning,
  abstract = {Recent work has shown improvements in text clustering and classification tasks by integrating conceptual features extracted from ontologies. In this paper we present text mining experiments in the medical domain in which the ontological structures used are acquired automatically in an unsupervised learning process from the text corpus in question. We compare results obtained using the automatically learned ontologies with those obtained using manually engineered ones. Our results show that both types of ontologies improve results on text clustering and classification tasks, whereby the automatically acquired ontologies yield a improvement competitive with the manually engineered ones.},
  author = {Bloehdorn, Stephan and Cimiano, Philipp and Hotho, Andreas},
  booktitle = {From Data and Information Analysis to Knowledge Engineering},
  doi = {10.1007/3-540-31314-1_40},
  interhash = {cf1af505b638677f00b3d3d7a5903199},
  intrahash = {bc1d40cf4fd64780ecf712b1e40f31de},
  isbn = {978-3-540-31313-7},
  pages = {334--341},
  publisher = {Springer Berlin Heidelberg},
  title = {Learning Ontologies to Improve Text Clustering and Classification},
  url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2006/2006-03-gfkl05-bloehdorn-etal-learning-ontologies.pdf},
  year = 2006
}

% NOTE(review): retyped from article to inproceedings. "Text, Speech and
% Dialogue" appears to be a conference proceedings volume (the DOI in the url
% has the form of a book-chapter DOI) -- confirm. The doi field is copied from
% the url.
@inproceedings{colas2006behavior,
  abstract = {Document classification has already been widely studied. In fact, some studies compared feature selection techniques or feature space transformation whereas some others compared the performance of different algorithms. Recently, following the rising interest towards the Support Vector Machine, various studies showed that the SVM outperforms other classification algorithms. So should we just not bother about other classification algorithms and opt always for SVM?},
  author = {Colas, Fabrice and Brazdil, Pavel},
  booktitle = {Text, Speech and Dialogue},
  doi = {10.1007/11846406_6},
  interhash = {327719a4d7d383ea20592e32e1cd7c50},
  intrahash = {da96ccc48a52cf844e16de4b5a889313},
  pages = {45--52},
  title = {On the Behavior of {SVM} and Some Older Algorithms in Binary Text Classification Tasks},
  url = {http://dx.doi.org/10.1007/11846406_6},
  year = 2006
}

@article{1324190,
  address = {New York, NY, USA},
  author = {Stavrianou, Anna and Andritsos, Periklis and Nicoloyannis, Nicolas},
  doi = {10.1145/1324185.1324190},
  interhash = {bde58d2eeb65f2194171f93b0e1f2a21},
  intrahash = {d8c54095392c0e83ab4f50f694d3b1f3},
  issn = {0163-5808},
  journal = {SIGMOD Record},
  number = 3,
  pages = {23--34},
  publisher = {ACM},
  title = {Overview and semantic issues of text mining},
  url = {http://portal.acm.org/citation.cfm?id=1324190},
  volume = 36,
  year = 2007
}

% NOTE(review): this record has no journal, so it is typed as misc rather than
% article until the actual venue is confirmed.
@misc{blaz06ontogen,
  abstract = {In this paper we present a new version of OntoGen system for semi-automatic data-driven ontology construction. The system is based on a novel ontology learning framework which formalizes and extends the role of machine learning and text mining algorithms used in the previous version.
List of new features includes extended number of supported ontology formats (RDFS and OWL), supervised methods for concept discovery (based on Active Learning), adding of new instances to ontology and improved user interface (based on comments from the users).},
  author = {Fortuna, Blaz and Grobelnik, Marko and Mladeni{\'c}, Dunja},
  interhash = {91f979c3983afb0d8ef4e7c51e46c5aa},
  intrahash = {fb694aa68ef84daf7556cd14b92bdc04},
  title = {Semi-automatic data-driven ontology construction system},
  url = {http://www.scientificcommons.org/17521109},
  year = 2006
}

% NOTE(review): the exported record had the special-issue heading in the
% journal field. The heading is preserved in note, and the journal is set to
% Informatica, which is believed to be the venue of volume 29 -- confirm.
@article{fortuna2005vtd,
  author = {Fortuna, B. and Grobelnik, M. and Mladeni{\'c}, D.},
  interhash = {3db2674afa1f013492af950b6606b780},
  intrahash = {cc348c82464832a3f6a165978973e179},
  journal = {Informatica},
  note = {Special Issue: Hot Topics in European Agent Research I, Guest Editors: Andrea Omicini},
  pages = {497--502},
  title = {Visualization of text document corpus},
  volume = 29,
  year = 2005
}

@inproceedings{PuWang:2007,
  abstract = {The exponential growth of text documents available on the Internet has created an urgent need for accurate, fast, and general purpose text classification algorithms. However, the "bag of words" representation used for these classification methods is often unsatisfactory as it ignores relationships between important terms that do not co-occur literally. In order to deal with this problem, we integrate background knowledge - in our application: Wikipedia - into the process of classifying text documents. The experimental evaluation on Reuters newsfeeds and several other corpus shows that our classification results with encyclopedia knowledge are much better than the baseline "bag of words " methods.},
  author = {Wang, Pu and Hu, Jian and Zeng, Hua-Jun and Chen, Lijun and Chen, Zheng},
  booktitle = {Seventh {IEEE} International Conference on Data Mining ({ICDM} 2007)},
  doi = {10.1109/ICDM.2007.77},
  interhash = {8a899b60047e20e162fc12b2ff6f8142},
  intrahash = {66058efbca5abd1222f72c32365d23fa},
  isbn = {978-0-7695-3018-5},
  issn = {1550-4786},
  pages = {332--341},
  title = {Improving Text Classification by Using Encyclopedia Knowledge},
  url = {ftp://ftp.computer.org/press/outgoing/proceedings/icdm07/Data/3018a332.pdf},
  year = 2007
}

@inproceedings{IfrimTW-ICML2005,
  address = {Bonn, Germany},
  author = {Ifrim, Georgiana and Theobald, Martin and Weikum, Gerhard},
  booktitle = {Proceedings of the 22nd International Conference on Machine Learning - Learning in Web Search (LWS 2005)},
  editor = {De Raedt, Luc and Wrobel, Stefan},
  interhash = {a54c4070e0fb55f5a084a0f088230a65},
  intrahash = {57f8241941ed979455c3dbb90893020f},
  isbn = {1-59593-180-5},
  pages = {18--26},
  title = {Learning Word-to-Concept Mappings for Automatic Text Classification},
  url = {http://www.mpi-inf.mpg.de/~ifrim/publications/icml-lws05.pdf},
  year = 2005
}

@book{0387954333,
  address = {Berlin},
  asin = {0387954333},
  author = {Weiss, Sholom M. and Indurkhya, Nitin and Zhang, T.},
  dewey = {006.312},
  ean = {9780387954332},
  edition = 1,
  interhash = {d75b9da07cf40d54a79e6d8995f78a31},
  intrahash = {6ac07561b543e6033fd4c9811d0dccad},
  isbn = {0387954333},
  publisher = {Springer},
  title = {Text Mining. Predictive Methods for Analyzing Unstructured Information},
  url = {http://www.amazon.de/gp/redirect.html%3FASIN=0387954333%26tag=ws%26lcode=xm2%26cID=2025%26ccmID=165953%26location=/o/ASIN/0387954333%253FSubscriptionId=13CT5CVB80YFWJEPWS02},
  year = 2004
}

@inproceedings{baker98distributional,
  address = {Melbourne, AU},
  author = {Baker, L. Douglas and McCallum, Andrew K.},
  booktitle = {Proceedings of {SIGIR}-98, 21st {ACM} International Conference on Research and Development in Information Retrieval},
  editor = {Croft, W. Bruce and Moffat, Alistair and van Rijsbergen, Cornelis J.
and Wilkinson, Ross and Zobel, Justin},
  interhash = {f116fa6b3ef1eefecb8bf27dfaa53ee7},
  intrahash = {e472dc4e61921ed15175756fcd9fea6a},
  pages = {96--103},
  publisher = {ACM Press, New York, US},
  title = {Distributional clustering of words for text classification},
  url = {http://citeseer.ist.psu.edu/baker98distributional.html},
  year = 1998
}

@inproceedings{658040,
  address = {Washington, DC, USA},
  author = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen},
  booktitle = {ICDM '01: Proceedings of the 2001 IEEE International Conference on Data Mining},
  interhash = {e2f356aeefc84fd73c9bcdc08392edf0},
  intrahash = {a6803e87c5145d5f55d7bb1bab8dfd67},
  isbn = {0-7695-1119-8},
  pages = {607--608},
  publisher = {IEEE Computer Society},
  title = {Text Clustering Based on Good Aggregations},
  url = {http://portal.acm.org/citation.cfm?id=658040},
  year = 2001
}

% The url below is stored raw; styles add their own url markup.
@inproceedings{hotho_fgml02,
  author = {Hotho, A. and Stumme, G.},
  booktitle = {Proceedings of FGML Workshop},
  interhash = {3dd3d4ce38d0de0ba8e167f8133cbb3e},
  intrahash = {18fdbebb76d48feccf2dceed23f4cd74},
  pages = {37--45},
  publisher = {Special Interest Group of German Informatics Society (FGML --- Fachgruppe Maschinelles Lernen der GI e.V.)},
  title = {Conceptual Clustering of Text Clusters},
  url = {http://www.aifb.uni-karlsruhe.de/WBS/aho/pub/tc_fca_2002_submit.pdf},
  year = 2002
}

@inproceedings{hotho02textws,
  author = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen and Zacharias, Valentin},
  booktitle = {Proc.
of Text Mining Workshop},
  interhash = {a8e24d64d26ca5b681a2c29e005cbcc2},
  intrahash = {56ae0afc93999014629b06cc958f6a04},
  title = {On Knowledgeable Unsupervised Text Mining},
  url = {http://www.aifb.uni-karlsruhe.de/WBS/aho/pub/txt_mining_ws_2002.pdf},
  year = 2002
}

@inproceedings{cim04a,
  address = {Lisbon, Portugal},
  author = {Cimiano, Philipp and Hotho, Andreas and Staab, Steffen},
  booktitle = {Proceedings of the Conference on Language Resources and Evaluation ({LREC})},
  interhash = {9374d126c328dab48f52854f73d6db4f},
  intrahash = {3bc6e5a51dba862da1b7b3b6ac563370},
  month = may,
  publisher = {ELRA - European Language Resources Association},
  title = {Clustering Ontologies from Text},
  url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/lrec04.pdf},
  year = 2004
}

@inproceedings{lauser03,
  author = {Lauser, Boris and Hotho, Andreas},
  booktitle = {Proc. of the 7th European Conference in Research and Advanced Technology for Digital Libraries, ECDL 2003},
  interhash = {feb38928054a3691f83122b0172c5116},
  intrahash = {8b298c325c6ecdb9c01e01057464ae2d},
  pages = {140--151},
  publisher = {Springer},
  series = {LNCS},
  title = {Automatic multi-label subject indexing in a multilingual environment},
  volume = 2769,
  year = 2003
}

@inproceedings{hotho_pkdd03,
  author = {Hotho, A. and Staab, S. and Stumme, G.},
  booktitle = {Proc.
of the 7th European Conference on Principles and Practice of Knowledge Discovery in Databases, PKDD},
  interhash = {cf66183151a5d94a0941ac6d5089ae89},
  intrahash = {c1bb26aa5d4801542f832ffab70c82e5},
  pages = {217--228},
  series = {LNCS},
  title = {Explaining Text Clustering Results using Semantic Structures},
  volume = 2838,
  year = 2003
}

@incollection{bloehdorn2006boosting,
  author = {Bloehdorn, Stephan and Hotho, Andreas},
  booktitle = {Advances in Web Mining and Web Usage Analysis},
  citeulike-article-id = {910161},
  doi = {10.1007/11899402_10},
  interhash = {9351de43861fe23833c5d074059f310a},
  intrahash = {e297ea25cc01678c35501a44115aea8d},
  isbn = {978-3-540-47127-1},
  issn = {0302-9743},
  pages = {149--166},
  priority = {2},
  publisher = {Springer},
  series = {LNCS},
  title = {Boosting for Text Classification with Semantic Features},
  url = {http://dx.doi.org/10.1007/11899402_10},
  vgwort = {32},
  volume = 3932,
  year = 2006
}

@inproceedings{conf/iis/StaabH03,
  author = {Staab, Steffen and Hotho, Andreas},
  booktitle = {Intelligent Information Processing and Web Mining, Proceedings of the International IIS: IIPWM'03 Conference held in Zakopane},
  interhash = {dcb3c9710a44a43f9d8b17c5fc2b0f8c},
  intrahash = {d773061117a913428968cc99c6e1ec0f},
  isbn = {3-540-00843-8},
  pages = {451--452},
  title = {Ontology-based Text Document Clustering},
  url = {http://dblp.uni-trier.de/db/conf/iis/iis2003.html#StaabH03},
  year = 2003
}

% NOTE(review): this record was exported while the article was still in press.
% The status text that sat in the volume field is now in note, and the empty
% page range was dropped. TODO: add the real volume and pages from the
% published version.
@article{kostoff,
  abstract = {Literature-related discovery (LRD) is the linking of two or more literature concepts that have heretofore not been linked (i.e., disjoint), in order to produce novel, interesting, plausible, and intelligible knowledge (i.e., potential discovery). The open discovery systems (ODS) component of LRD starts with a problem to be solved, and generates solutions to that problem through potential discovery. We have been using ODS LRD to identify potential treatments or preventative actions for challenging medical problems, among myriad other applications. This paper describes the second medical problem we addressed (cataract) using ODS LRD; the first problem addressed was Raynaud's Phenomenon (RP), and was described in the third paper of this Special Issue. Cataract was selected because it is ubiquitous globally, appears intractable to all forms of treatment other than surgical removal of cataracts, and is a major cause of blindness in many developing countries. The ODS LRD study had three objectives: a) identify non-drug non-surgical treatments that would 1) help prevent cataracts, or 2) reduce the progression rate of cataracts, or 3) stop the progression of cataracts, or 4) maybe even reverse the progression of cataracts; b) demonstrate that we could solve an ODS LRD problem with no prior knowledge of any results or prior work (unlike the case with the RP problem); c) determine whether large time savings in the discovery process were possible relative to the time required for conducting the RP study. To that end, we used the MeSH taxonomy of MEDLINE to restrict potential discoveries to selected semantic classes, as a substitute for the manually-intensive process used in the RP study to restrict potential discoveries to selected semantic classes. We also used additional semantic filtering to identify potential discovery within the selected semantic classes. All these goals were achieved. As will be shown, we generated large amounts of potential discovery in more than an order of magnitude less time than required for the RP study. We identified many non-drug non-surgical treatments that may be able to reduce or even stop the progression rate of cataracts. Time, and much testing, will determine whether this is possible. Finally, the methodology has been developed to the point where ODS LRD problems can be solved with no results or knowledge of any prior work.},
  author = {Kostoff, Ronald N.},
  interhash = {45ce0cd73dd62182ce1e447ba9fe71eb},
  intrahash = {b9359f79985da9b9677340ffda849e74},
  journal = {Technological Forecasting and Social Change},
  note = {In Press, Corrected Proof},
  title = {Literature-related discovery (LRD): Potential treatments for cataracts},
  url = {http://www.sciencedirect.com/science/article/B6V71-4RDB8SC-9/2/8991fe8968a0ef12f22ed7e9ac9d7c4f},
  year = 2007
}

@article{984322,
  abstract = {Language modeling approaches to information retrieval are attractive and promising because they connect the problem of retrieval with that of language model estimation, which has been studied extensively in other application areas such as speech recognition. The basic idea of these approaches is to estimate a language model for each document, and to then rank documents by the likelihood of the query according to the estimated language model. A central issue in language model estimation is smoothing, the problem of adjusting the maximum likelihood estimator to compensate for data sparseness. In this article, we study the problem of language model smoothing and its influence on retrieval performance. We examine the sensitivity of retrieval performance to the smoothing parameters and compare several popular smoothing methods on different test collections. Experimental results show that not only is the retrieval performance generally sensitive to the smoothing parameters, but also the sensitivity pattern is affected by the query type, with performance being more sensitive to smoothing for verbose queries than for keyword queries. Verbose queries also generally require more aggressive smoothing to achieve optimal performance.
This suggests that smoothing plays two different role---to make the estimated document language model more accurate and to "explain" the noninformative words in the query. In order to decouple these two distinct roles of smoothing, we propose a two-stage smoothing strategy, which yields better sensitivity patterns and facilitates the setting of smoothing parameters automatically. We further propose methods for estimating the smoothing parameters automatically. Evaluation on five different databases and four types of queries indicates that the two-stage smoothing method with the proposed parameter estimation methods consistently gives retrieval performance that is close to---or better than---the best results achieved using a single smoothing method and exhaustive parameter search on the test data.},
  address = {New York, NY, USA},
  author = {Zhai, Chengxiang and Lafferty, John},
  doi = {10.1145/984321.984322},
  interhash = {4d0acc84788713f07adbe0df3adc92d8},
  intrahash = {c7aff853599cdde58a1d27eff4ede314},
  issn = {1046-8188},
  journal = {ACM Transactions on Information Systems},
  number = 2,
  pages = {179--214},
  publisher = {ACM Press},
  title = {A study of smoothing methods for language models applied to information retrieval},
  url = {http://portal.acm.org/citation.cfm?id=984322},
  volume = 22,
  year = 2004
}

@inproceedings{conf/icdm/PopesculULP03,
  author = {Popescul, Alexandrin and Ungar, Lyle H. and Lawrence, Steve and Pennock, David M.},
  booktitle = {ICDM},
  crossref = {conf/icdm/2003},
  date = {2004-01-28},
  ee = {http://csdl.computer.org/comp/proceedings/icdm/2003/1978/00/19780275abs.htm},
  interhash = {3bcb76c6628b1752db555f86fe39429e},
  intrahash = {7cdd6b0791fcdf17ec6d404b55f12c5c},
  isbn = {0-7695-1978-4},
  pages = {275--282},
  publisher = {IEEE Computer Society},
  title = {Statistical Relational Learning for Document Mining},
  url = {http://www.cis.upenn.edu/~popescul/Publications/popescul03dm.pdf},
  year = 2003
}

@inproceedings{feldman95KDT,
  author = {Feldman, R.
and Dagan, I.}, booktitle = {Proc. of the First Int. Conf. on Knowledge Discovery (KDD)}, interhash = {15f076596b35048463f828687410ea30}, intrahash = {d1bb2e8dff9bd80da158b4b770685dce}, key = {feldman95KDT}, label = {KDT - Knowledge Discovery in Texts}, pages = {112-117}, title = {Knowledge Discovery in Textual Databases (KDT)}, type = {InProceedings}, year = 1995 } @inproceedings{sanderson99-deriving, author = {Sanderson, Mark and Croft, William Bruce}, booktitle = {Proceedings of the 22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR'99}, interhash = {b351eb1a827b4d024323c4706035c938}, intrahash = {d15caaaea82b6df0747cc298a8b13556}, lastdatemodified = {2007-04-14}, lastname = {Sanderson}, own = {notown}, pages = {206--213}, pdf = {sanderson99-deriving.pdf}, read = {notread}, title = {Deriving concept hierarchies from text}, year = 1999 } @book{cimiano2006, address = {Secaucus, NJ, USA}, author = {Cimiano, Philipp}, interhash = {f8a70c22cfd162dc9ad2cd977d79b66c}, intrahash = {fdfff52cddb448c0094213aff5bcaf31}, isbn = {0387306323}, publisher = {Springer-Verlag New York, Inc.}, title = {Ontology Learning and Population from Text: Algorithms, Evaluation and Applications}, url = {http://portal.acm.org/citation.cfm?id=1177318}, year = 2006 } @inproceedings{steinbach00comparison, author = {Steinbach, M. and Karypis, G. and Kumar, V.}, booktitle = {KDD Workshop on Text Mining}, interhash = {3340fbf75ada2ccb45a50dd5832f5f07}, intrahash = {10e5c1e3ff54d9dce505a231f8ae7b32}, title = {A comparison of document clustering techniques}, url = {http://citeseer.ist.psu.edu/steinbach00comparison.html}, year = 2000 } @article{keyhere, abstract = {Concepts of complex networks have been used to obtain metrics that were correlated to text quality established by scores assigned by human judges. 
Texts produced by high-school students in Portuguese were represented as scale-free networks (word adjacency model), from which typical network features such as the in/outdegree, clustering coefficient and shortest path were obtained. Another metric was derived from the dynamics of the network growth, based on the variation of the number of connected components. The scores assigned by the human judges according to three text quality criteria (coherence and cohesion, adherence to standard writing conventions and theme adequacy/development) were correlated with the network measurements. Text quality for all three criteria was found to decrease with increasing average values of outdegrees, clustering coefficient and deviation from the dynamics of network growth. Among the criteria employed, cohesion and coherence showed the strongest correlation, which probably indicates that the network measurements are able to capture how the text is developed in terms of the concepts represented by the nodes in the networks. Though based on a particular set of texts and specific language, the results presented here point to potential applications in other instances of text analysis.}, author = {Antiqueira, L. and Nunes, M.G.V. and Jr., O.N. Oliveira and da F. 
Costa, L.}, interhash = {d3fb61cdf3d02e793dfb86d7eb30ae90}, intrahash = {8fb247044a9326fd0f4d222d4dd4b4d8}, journal = {Physica A: Statistical and Theoretical Physics}, month = {#jan#}, pages = {811--820}, title = {Strong correlations between text quality and complex networks features}, url = {http://www.sciencedirect.com/science/article/B6TVG-4KB1072-2/2/6555b4ec9893dbd30d280dbe22763fac}, volume = 373, year = 2007 } @misc{ieKey, author = {Boley, Daniel and Gini, Maria and Gross, Robert and Han, Eui-Hong (Sam) and Hastings, Kyle and Karypis, George and Kumar, Vipin and Mobasher, Bamshad and Moore, Jerome}, date = {1999}, interhash = {d544ef5463da700ac7209b61b5bc7eef}, intrahash = {1a1d7962e0dbc3b0afac99911db093e1}, journal = {To appear in Decision Support Systems Journal}, title = {"Partitioning-Based Clustering for Web Document Categorization}, year = 1999 } @inproceedings{775110, address = {New York, NY, USA}, author = {Beil, Florian and Ester, Martin and Xu, Xiaowei}, booktitle = {KDD '02: Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, doi = {http://doi.acm.org/10.1145/775047.775110}, interhash = {afbbcb8e9e77abf0eca425048f104a51}, intrahash = {1e335abdf44d287a375a6383683d1b98}, isbn = {1-58113-567-X}, location = {Edmonton, Alberta, Canada}, pages = {436--442}, publisher = {ACM Press}, title = {Frequent term-based text clustering}, year = 2002 } @article{Sebastiani02, author = {Sebastiani, F.}, bb-further-address = {--Dordrecht--London}, interhash = {d945d9218673dad37dc2a06cbf9e554c}, intrahash = {0fe0d5dd12c2cb59dfc330e684ec4b4a}, journal = {ACM Computing Surveys}, number = 1, pages = {1--47}, title = {Machine learning in automated text categorization}, url = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf}, volume = 34, year = 2002 } @article{1131907, address = {New York, NY, USA}, author = {Tjhi, William-Chandra and Chen, Lihui}, doi = {http://dx.doi.org/10.1016/j.patrec.2005.07.012}, 
interhash = {1e398099e0879fba9a7b874a10a4fd20}, intrahash = {c5916b7f616900d5dbe103395450e83c}, issn = {0167-8655}, journal = {Pattern Recogn. Lett.}, number = 3, pages = {151--159}, publisher = {Elsevier Science Inc.}, title = {A partitioning based algorithm to fuzzy co-cluster documents and words}, volume = 27, year = 2006 } @inproceedings{conf/sigir/XuLG03, author = {Xu, Wei and Liu, Xin and Gong, Yihong}, booktitle = {SIGIR}, ee = {http://doi.acm.org/10.1145/860485}, interhash = {27211261d57f136a1116f2f58266fae7}, intrahash = {d1c6f4bdc7ed10778a573dcc395074d9}, pages = {267-273}, title = {Document clustering based on non-negative matrix factorization.}, url = {http://dblp.uni-trier.de/db/conf/sigir/sigir2003.html#XuLG03}, year = 2003 } @inproceedings{bloehdorn2004icdm, author = {Bloehdorn, Stephan and Hotho, Andreas}, booktitle = {Proceedings of the Fourth IEEE International Conference on Data Mining}, interhash = {f18089d50fdc9c9e38c4fc1e350bdd4e}, intrahash = {7df6357c79445d811f4a9223e688da14}, month = NOV, pages = {331-334}, publisher = {IEEE Computer Society Press}, title = {Text Classification by Boosting Weak Learners based on Terms and Concepts}, url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/icdm04boosting.pdf}, year = 2004 } @inproceedings{hotho-ijcaiws2001, author = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen}, booktitle = {Proc. of the Workshop ``Text Learning: Beyond Supervision'' at IJCAI 2001. Seattle, WA, USA, August 6, 2001}, interhash = {cce452c6e28cb3cfb99b3416457f1b24}, intrahash = {15a8cfc9a49b70812a55c76e597db7a5}, publihser = {Springer}, title = {Ontology-based Text Clustering}, year = 2001 }