@inproceedings{conf/mldm/ToivonenVVBV01, author = {Toivonen, Jarmo and Visa, Ari and Vesanen, Tomi and Back, Barbro and Vanharanta, Hannu}, booktitle = {MLDM}, crossref = {conf/mldm/2001}, editor = {Perner, Petra}, ee = {http://dx.doi.org/10.1007/3-540-44596-X_15}, interhash = {2121b03b46ecdde012bae15ca8cf8ce6}, intrahash = {2f23db9219b4d693acf15d7401684499}, isbn = {3-540-42359-1}, pages = {184-195}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Validation of Text Clustering Based on Document Contents.}, url = {http://dblp.uni-trier.de/db/conf/mldm/mldm2001.html#ToivonenVVBV01}, volume = 2123, year = 2001 } @article{Luo20091271, abstract = {Clustering is a very powerful data mining technique for topic discovery from text documents. The partitional clustering algorithms, such as the family of k-means, are reported performing well on document clustering. They treat the clustering problem as an optimization process of grouping documents into k clusters so that a particular criterion function is minimized or maximized. Usually, the cosine function is used to measure the similarity between two documents in the criterion function, but it may not work well when the clusters are not well separated. To solve this problem, we applied the concepts of neighbors and link, introduced in [S. Guha, R. Rastogi, K. Shim, ROCK: a robust clustering algorithm for categorical attributes, Information Systems 25 (5) (2000) 345-366], to document clustering. If two documents are similar enough, they are considered as neighbors of each other. And the link between two documents represents the number of their common neighbors. Instead of just considering the pairwise similarity, the neighbors and link involve the global information into the measurement of the closeness of two documents. In this paper, we propose to use the neighbors and link for the family of k-means algorithms in three aspects: a new method to select initial cluster centroids based on the ranks of candidate documents; a new similarity measure which uses a combination of the cosine and link functions; and a new heuristic function for selecting a cluster to split based on the neighbors of the cluster centroids. Our experimental results on real-life data sets demonstrated that our proposed methods can significantly improve the performance of document clustering in terms of accuracy without increasing the execution time much.}, author = {Luo, Congnan and Li, Yanjun and Chung, Soon M.}, doi = {10.1016/j.datak.2009.06.007}, interhash = {bf59c4cf26cbc35d6142630b34a66d37}, intrahash = {13483e90d8b46ef9435ec71473aacee4}, issn = {0169-023X}, journal = {Data & Knowledge Engineering}, note = {Including Special Section: Conference on Privacy in Statistical Databases (PSD 2008) - Six selected and extended papers on Database Privacy}, number = 11, pages = {1271 - 1288}, title = {Text document clustering based on neighbors}, url = {http://www.sciencedirect.com/science/article/B6TYX-4WNB4Y8-1/2/1dcd00d9c049988da53b44a526dd6555}, volume = 68, year = 2009 } @inproceedings{conf/sigir/HuFCZLYC08, author = {Hu, Jian and Fang, Lujun and Cao, Yang and Zeng, Hua-Jun and Li, Hua and Yang, Qiang and Chen, Zheng}, booktitle = {SIGIR}, crossref = {conf/sigir/2008}, date = {2008-07-27}, editor = {Myaeng, Sung-Hyon and Oard, Douglas W. 
and Sebastiani, Fabrizio and Chua, Tat-Seng and Leong, Mun-Kew}, ee = {http://doi.acm.org/10.1145/1390334.1390367}, interhash = {0a2878165034dcdfacb9045608ec482a}, intrahash = {76f863a12c0b983ec67682deaec1ada4}, isbn = {978-1-60558-164-4}, pages = {179-186}, publisher = {ACM}, title = {Enhancing text clustering by leveraging Wikipedia semantics.}, url = {http://dblp.uni-trier.de/db/conf/sigir/sigir2008.html#HuFCZLYC08}, year = 2008 } @inproceedings{hotho03ontologies, address = {Melbourne, Florida}, author = {Hotho, Andreas and Staab, Steffen and Stumme, Gerd}, booktitle = {Proceedings of the 2003 IEEE International Conference on Data Mining}, comment = {alpha}, interhash = {b56c36d6d9c9ca9e6bd236a0f92415a5}, intrahash = {57a39c81cff1982dbefed529be934bee}, month = {November 19-22}, pages = {541-544 (Poster)}, publisher = {IEEE Computer Society}, title = {Ontologies improve text document clustering}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003ontologies.pdf}, year = 2003 } @techreport{hotho03textclustering, abstract = {Text document clustering plays an important role in providing intuitive navigation and browsing mechanisms by organizing large amounts of information into a small number of meaningful clusters. Standard partitional or agglomerative clustering methods efficiently compute results to this end. However, the bag of words representation used for these clustering methods is often unsatisfactory as it ignores relationships between important terms that do not co-occur literally. Also, it is mostly left to the user to find out why a particular partitioning has been achieved, because it is only specified extensionally. In order to deal with the two problems, we integrate background knowledge into the process of clustering text documents. First, we preprocess the texts, enriching their representations by background knowledge provided in a core ontology — in our application Wordnet. Then, we cluster the documents by a partitional algorithm. Our experimental evaluation on Reuters newsfeeds compares clustering results with pre-categorizations of news. In the experiments, improvements of results by background knowledge compared to the baseline can be shown for many interesting tasks. Second, the clustering partitions the large number of documents to a relatively small number of clusters, which may then be analyzed by conceptual clustering. In our approach, we applied Formal Concept Analysis. Conceptual clustering techniques are known to be too slow for directly clustering several hundreds of documents, but they give an intensional account of cluster results. They allow for a concise description of commonalities and distinctions of different clusters. With background knowledge they even find abstractions like “food” (vs. specializations like “beef” or “corn”). Thus, in our approach, partitional clustering reduces first the size of the problem such that it becomes tractable for conceptual clustering, which then facilitates the understanding of the results.}, author = {Hotho, Andreas and Staab, Steffen and Stumme, Gerd}, comment = {alpha}, institution = {University of Karlsruhe, Institute AIFB}, interhash = {0bc7c3fc1273355f45c8970a7ea58f97}, intrahash = {61d58db419af0dbc3681432588219c3d}, title = {Text Clustering Based on Background Knowledge}, type = {Technical Report}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003text.pdf}, volume = 425, year = 2003 } @inproceedings{hotho02conceptualclustering, author = {Hotho, A.
and Stumme, G.}, booktitle = {Proc. Fachgruppentreffen Maschinelles Lernen (FGML 2002)}, comment = {alpha}, editor = {K\'okai, G. and Zeidler, J.}, interhash = {3dd3d4ce38d0de0ba8e167f8133cbb3e}, intrahash = {e253c44552a046fe90236274bcfeab13}, pages = {37-45}, title = {Conceptual Clustering of Text Clusters}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2002/FGML02.pdf}, year = 2002 } @inproceedings{hotho03wordnet, address = {Toronto}, author = {Hotho, A. and Staab, S. and Stumme, G.}, booktitle = {Proc. SIGIR Semantic Web Workshop}, comment = {alpha}, interhash = {c2a9a89ce20cef90a1e78d34dc2c2afe}, intrahash = {04c7d86337d68e4ed9ae637029c43414}, title = {Wordnet improves text document clustering}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003wordnet.pdf}, year = 2003 } @inproceedings{hotho03explaining, abstract = {Common text clustering techniques offer rather poor capabilities for explaining to their users why a particular result has been achieved. They have the disadvantage that they do not relate semantically nearby terms and that they cannot explain how resulting clusters are related to each other. In this paper, we discuss a way of integrating a large thesaurus and the computation of lattices of resulting clusters into common text clustering in order to overcome these two problems. As its major result, our approach achieves an explanation using an appropriate level of granularity at the concept level as well as an appropriate size and complexity of the explaining lattice of resulting clusters.}, address = {Heidelberg}, author = {Hotho, Andreas and Staab, Steffen and Stumme, Gerd}, booktitle = {Knowledge Discovery in Databases: PKDD 2003, 7th European Conference on Principles and Practice of Knowledge Discovery in Databases}, comment = {alpha}, editor = {Lavra\v{c}, Nada and Gamberger, Dragan and Todorovski, Ljupco and Blockeel, Hendrik}, interhash = {cf66183151a5d94a0941ac6d5089ae89}, intrahash = {53a943b6be4b34cf4e5329d0b58e99f6}, pages = {217-228}, publisher = {Springer}, series = {LNAI}, title = {Explaining Text Clustering Results using Semantic Structures}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003explaining.pdf}, volume = 2838, year = 2003 } @incollection{bloehdorn2006learning, abstract = {Recent work has shown improvements in text clustering and classification tasks by integrating conceptual features extracted from ontologies. In this paper we present text mining experiments in the medical domain in which the ontological structures used are acquired automatically in an unsupervised learning process from the text corpus in question. We compare results obtained using the automatically learned ontologies with those obtained using manually engineered ones. Our results show that both types of ontologies improve results on text clustering and classification tasks, whereby the automatically acquired ontologies yield an improvement competitive with the manually engineered ones.
}, author = {Bloehdorn, Stephan and Cimiano, Philipp and Hotho, Andreas}, booktitle = {From Data and Information Analysis to Knowledge Engineering}, doi = {10.1007/3-540-31314-1_40}, interhash = {cf1af505b638677f00b3d3d7a5903199}, intrahash = {bc1d40cf4fd64780ecf712b1e40f31de}, isbn = {978-3-540-31313-7}, pages = {334--341}, publisher = {Springer Berlin Heidelberg}, title = {Learning Ontologies to Improve Text Clustering and Classification}, url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2006/2006-03-gfkl05-bloehdorn-etal-learning-ontologies.pdf}, year = 2006 } @inproceedings{baker98distributional, address = {Melbourne, AU}, author = {Baker, L. Douglas and McCallum, Andrew K.}, booktitle = {Proceedings of {SIGIR}-98, 21st {ACM} International Conference on Research and Development in Information Retrieval}, editor = {Croft, W. Bruce and Moffat, Alistair and van Rijsbergen, Cornelis J. and Wilkinson, Ross and Zobel, Justin}, interhash = {f116fa6b3ef1eefecb8bf27dfaa53ee7}, intrahash = {e472dc4e61921ed15175756fcd9fea6a}, pages = {96--103}, publisher = {ACM Press, New York, US}, title = {Distributional clustering of words for text classification}, url = {http://citeseer.ist.psu.edu/baker98distributional.html}, year = 1998 } @inproceedings{658040, address = {Washington, DC, USA}, author = {Hotho, Andreas and Maedche, Alexander and Staab, Steffen}, booktitle = {ICDM '01: Proceedings of the 2001 IEEE International Conference on Data Mining}, interhash = {e2f356aeefc84fd73c9bcdc08392edf0}, intrahash = {a6803e87c5145d5f55d7bb1bab8dfd67}, isbn = {0-7695-1119-8}, pages = {607--608}, publisher = {IEEE Computer Society}, title = {Text Clustering Based on Good Aggregations}, url = {http://portal.acm.org/citation.cfm?id=658040}, year = 2001 } @inproceedings{hotho_fgml02, author = {Hotho, A. and Stumme, G.}, booktitle = {Proceedings of FGML Workshop}, interhash = {3dd3d4ce38d0de0ba8e167f8133cbb3e}, intrahash = {18fdbebb76d48feccf2dceed23f4cd74}, pages = {37-45}, publisher = {Special Interest Group of German Informatics Society (FGML --- Fachgruppe Maschinelles Lernen der GI e.V.)}, title = {Conceptual Clustering of Text Clusters}, url = {http://www.aifb.uni-karlsruhe.de/WBS/aho/pub/tc_fca_2002_submit.pdf}, year = 2002 } @inproceedings{cim04a, address = {Lisbon, Portugal}, author = {Cimiano, Philipp and Hotho, Andreas and Staab, Steffen}, booktitle = {Proceedings of the Conference on Language Resources and Evaluation (LREC)}, interhash = {9374d126c328dab48f52854f73d6db4f}, intrahash = {3bc6e5a51dba862da1b7b3b6ac563370}, month = may, publisher = {ELRA - European Language Resources Association}, title = {Clustering Ontologies from Text}, url = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/lrec04.pdf}, year = 2004 } @inproceedings{hotho_pkdd03, author = {Hotho, A. and Staab, S. and Stumme, G.}, booktitle = {Proc.
of the 7th European Conference on Principles and Practice of Knowledge Discovery in Databases, PKDD}, interhash = {cf66183151a5d94a0941ac6d5089ae89}, intrahash = {c1bb26aa5d4801542f832ffab70c82e5}, pages = {217-228}, series = {LNCS}, title = {Explaining Text Clustering Results using Semantic Structures}, volume = 2838, year = 2003 } @inproceedings{conf/iis/StaabH03, author = {Staab, Steffen and Hotho, Andreas}, booktitle = {Intelligent Information Processing and Web Mining, Proceedings of the International IIS: IIPWM'03 Conference held in Zakopane}, interhash = {dcb3c9710a44a43f9d8b17c5fc2b0f8c}, intrahash = {d773061117a913428968cc99c6e1ec0f}, isbn = {3-540-00843-8}, pages = {451-452}, title = {Ontology-based Text Document Clustering.}, url = {http://dblp.uni-trier.de/db/conf/iis/iis2003.html#StaabH03}, year = 2003 } @article{kostoff, abstract = {Literature-related discovery (LRD) is the linking of two or more literature concepts that have heretofore not been linked (i.e., disjoint), in order to produce novel, interesting, plausible, and intelligible knowledge (i.e., potential discovery). The open discovery systems (ODS) component of LRD starts with a problem to be solved, and generates solutions to that problem through potential discovery. We have been using ODS LRD to identify potential treatments or preventative actions for challenging medical problems, among myriad other applications. This paper describes the second medical problem we addressed (cataract) using ODS LRD; the first problem addressed was Raynaud's Phenomenon (RP), and was described in the third paper of this Special Issue. Cataract was selected because it is ubiquitous globally, appears intractable to all forms of treatment other than surgical removal of cataracts, and is a major cause of blindness in many developing countries. The ODS LRD study had three objectives: a) identify non-drug non-surgical treatments that would 1) help prevent cataracts, or 2) reduce the progression rate of cataracts, or 3) stop the progression of cataracts, or 4) maybe even reverse the progression of cataracts; b) demonstrate that we could solve an ODS LRD problem with no prior knowledge of any results or prior work (unlike the case with the RP problem); c) determine whether large time savings in the discovery process were possible relative to the time required for conducting the RP study. To that end, we used the MeSH taxonomy of MEDLINE to restrict potential discoveries to selected semantic classes, as a substitute for the manually-intensive process used in the RP study to restrict potential discoveries to selected semantic classes. We also used additional semantic filtering to identify potential discovery within the selected semantic classes. All these goals were achieved. As will be shown, we generated large amounts of potential discovery in more than an order of magnitude less time than required for the RP study. We identified many non-drug non-surgical treatments that may be able to reduce or even stop the progression rate of cataracts. Time, and much testing, will determine whether this is possible. 
Finally, the methodology has been developed to the point where ODS LRD problems can be solved with no results or knowledge of any prior work.}, author = {Kostoff, Ronald N.}, interhash = {45ce0cd73dd62182ce1e447ba9fe71eb}, intrahash = {b9359f79985da9b9677340ffda849e74}, journal = {Technological Forecasting and Social Change}, pages = {--}, title = {Literature-related discovery (LRD): Potential treatments for cataracts}, url = {http://www.sciencedirect.com/science/article/B6V71-4RDB8SC-9/2/8991fe8968a0ef12f22ed7e9ac9d7c4f}, volume = {In Press, Corrected Proof}, year = 2007 } @inproceedings{sanderson99-deriving, author = {Sanderson, Mark and Croft, William Bruce}, booktitle = {Proceedings of the 22nd Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, SIGIR'99}, interhash = {b351eb1a827b4d024323c4706035c938}, intrahash = {d15caaaea82b6df0747cc298a8b13556}, lastdatemodified = {2007-04-14}, lastname = {Sanderson}, own = {notown}, pages = {206--213}, pdf = {sanderson99-deriving.pdf}, read = {notread}, title = {Deriving concept hierarchies from text}, year = 1999 }