% Reference collection on text mining, text structure, clustering and
% classification. One entry per work, one field per line.
%
% Fields such as interhash, intrahash, ee, asin, dewey, ean, refid and
% affiliation are not standard BibTeX fields; standard styles ignore them.
% They are retained as exported metadata (interhash/intrahash are presumably
% identifiers from the exporting service -- TODO confirm).
%
% NOTE(review): four entries carry a crossref (conf/birthday/2011studer,
% conf/mldm/2001, conf/icdm/2010w, conf/sigir/2008) whose parent entries are
% not defined in this part of the file. Confirm they exist elsewhere (after
% the children for classic BibTeX); otherwise BibTeX reports a bad cross
% reference.

@inproceedings{conf/wsdm/KohlschutterFN10,
  author    = {Kohlsch{\"u}tter, Christian and Fankhauser, Peter and Nejdl, Wolfgang},
  title     = {Boilerplate Detection using Shallow Text Features},
  booktitle = {Proceedings of the 3rd {ACM} International Conference on Web Search and Data Mining ({WSDM} 2010), New York City, NY, USA},
  year      = {2010},
  interhash = {25ea118166ef2f0d5597ca90fa702c9d},
  intrahash = {dbc8464d9a298afa49d607d65f2160e2},
}

@inproceedings{Teufel01task-basedevaluation,
  author    = {Teufel, Simone},
  title     = {Task-Based Evaluation of Summary Quality: Describing Relationships between Scientific Papers},
  booktitle = {Workshop on Automatic Summarization, {NAACL}},
  pages     = {12--21},
  year      = {2001},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.12.8139},
  abstract  = {We present a novel method for task-based evaluation of summaries of scientific articles. The task we propose is a question-answering task, where the questions are about the relatedness of the current paper to prior research. This evaluation method is time-efficient with respect to material preparation and data collection, so that it is possible to test against many different baselines, something that is not usually feasible in evaluations by relevance decision. We use this methodology to evaluate the quality of summaries our system produces. These summaries are designed to describe the contribution of a scientific article in relation to other work. The results show that this type of summary is indeed more useful than the baselines (random sentences, keyword lists and generic author-written summaries), and nearly as useful as the full texts.},
  interhash = {ed0c6de01aa6b0a3ef369627eb689cf3},
  intrahash = {b95470c8eae5d8f0372d20215c35f236},
}

@inproceedings{Kando99textstructure,
  author    = {Kando, Noriko},
  title     = {Text Structure Analysis as a Tool to Make Retrieved Documents Usable},
  booktitle = {Proceedings of the 4th International Workshop on Information Retrieval with Asian Languages},
  pages     = {126--135},
  year      = {1999},
  url       = {http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.34.3165},
  abstract  = {This paper describes an information retrieval system with the function to support user's use of the retrieved documents using the text-level structure of documents. The text-level structure of each document is described by the occurrence of typical functional components in the text. Automatic detection of the components has been attempted in previous works using surface-level language processing. The proposed system firstly utilizes the text structure to conduct high-precision searches of documents or passages by distinguishing the role or function each concept plays in the text. It also allows browsing or skimming of retrieved texts, creating summaries on-the-fly with various levels of condensation specified by the user. Moreover, the system can search and display any unit of a text such as a sentence, a paragraph or a chapter. Comparison of relevant passages in retrieved documents across multiple texts is helpful for users to examine, analyze, compare and integrate texts and...},
  interhash = {7cd913f37b21579ee636dc5036cca292},
  intrahash = {94d62d9f3ef51731ea1edc03662616d9},
}

@article{cimiano05learning,
  author    = {Cimiano, Philipp and Hotho, Andreas and Staab, Steffen},
  title     = {Learning Concept Hierarchies from Text Corpora using Formal Concept Analysis},
  journal   = {Journal of Artificial Intelligence Research},
  volume    = {24},
  pages     = {305--339},
  year      = {2005},
  url       = {http://dblp.uni-trier.de/db/journals/jair/jair24.html#CimianoHS05},
  ee        = {http://www.jair.org/papers/paper1648.html},
  interhash = {4c09568cff62babd362aab03095f4589},
  intrahash = {eaaf0e4b3a8b29fab23b6c15ce2d308d},
}

@article{mann1988rhetorical,
  author    = {Mann, William C. and Thompson, Sandra A.},
  title     = {Rhetorical structure theory: Toward a functional theory of text organization},
  journal   = {Text},
  volume    = {8},
  number    = {3},
  pages     = {243--281},
  year      = {1988},
  url       = {http://scholar.google.com/scholar.bib?q=info:BEw8CIWbucoJ:scholar.google.com/&output=citation&scisig=AAGBfm0AAAAAU3X_1Dq4ULnWfFzMeRsqGJcha1fReMSl&scisf=4&hl=en},
  interhash = {e8feceeba43734d376da50554b8071d2},
  intrahash = {8cedc5c82592cf1bbcfe5fa685fc5c67},
}

@article{Salton1996127,
  author    = {Salton, Gerard and Allan, James and Singhal, Amit},
  title     = {Automatic text decomposition and structuring},
  journal   = {Information Processing \& Management},
  volume    = {32},
  number    = {2},
  pages     = {127--138},
  year      = {1996},
  doi       = {10.1016/S0306-4573(96)85001-1},
  issn      = {0306-4573},
  url       = {http://www.sciencedirect.com/science/article/pii/S0306457396850011},
  interhash = {2fc2920e08a6a7dda2c256e62fc1e349},
  intrahash = {8674111d30a3c67d5d8a8b847cebb771},
}

@inproceedings{conf/birthday/BloehdornBCGHLMMSSV11,
  author    = {Bloehdorn, Stephan and Blohm, Sebastian and Cimiano, Philipp and Giesbrecht, Eugenie and Hotho, Andreas and L{\"o}sch, Uta and M{\"a}dche, Alexander and M{\"o}nch, Eddie and Sorg, Philipp and Staab, Steffen and V{\"o}lker, Johanna},
  title     = {Combining Data-Driven and Semantic Approaches for Text Mining},
  booktitle = {Foundations for the Web of Information and Services},
  editor    = {Fensel, Dieter},
  publisher = {Springer},
  pages     = {115--142},
  year      = {2011},
  crossref  = {conf/birthday/2011studer},
  doi       = {10.1007/978-3-642-19797-0_7},
  isbn      = {978-3-642-19796-3},
  url       = {http://dblp.uni-trier.de/db/conf/birthday/studer2011.html#BloehdornBCGHLMMSSV11},
  ee        = {http://dx.doi.org/10.1007/978-3-642-19797-0_7},
  interhash = {db48314326a36fc4ac8770cba2c20e49},
  intrahash = {21be5153a8f491c9f209d57ce7662387},
}

@inproceedings{conf/mldm/ToivonenVVBV01,
  author    = {Toivonen, Jarmo and Visa, Ari and Vesanen, Tomi and Back, Barbro and Vanharanta, Hannu},
  title     = {Validation of Text Clustering Based on Document Contents},
  booktitle = {{MLDM}},
  editor    = {Perner, Petra},
  series    = {Lecture Notes in Computer Science},
  volume    = {2123},
  publisher = {Springer},
  pages     = {184--195},
  year      = {2001},
  crossref  = {conf/mldm/2001},
  doi       = {10.1007/3-540-44596-X_15},
  isbn      = {3-540-42359-1},
  url       = {http://dblp.uni-trier.de/db/conf/mldm/mldm2001.html#ToivonenVVBV01},
  ee        = {http://dx.doi.org/10.1007/3-540-44596-X_15},
  interhash = {2121b03b46ecdde012bae15ca8cf8ce6},
  intrahash = {2f23db9219b4d693acf15d7401684499},
}

@inproceedings{conf/icdm/YassineH10,
  author    = {Yassine, Mohamed and Hajj, Hazem},
  title     = {A Framework for Emotion Mining from Text in Online Social Networks},
  booktitle = {{ICDM} Workshops},
  editor    = {Fan, Wei and Hsu, Wynne and Webb, Geoffrey I. and Liu, Bing and Zhang, Chengqi and Gunopulos, Dimitrios and Wu, Xindong},
  publisher = {{IEEE} Computer Society},
  pages     = {1136--1142},
  year      = {2010},
  crossref  = {conf/icdm/2010w},
  doi       = {10.1109/ICDMW.2010.75},
  url       = {http://dblp.uni-trier.de/db/conf/icdm/icdmw2010.html#YassineH10},
  ee        = {http://dx.doi.org/10.1109/ICDMW.2010.75},
  interhash = {72ae8c258d6559e4a90370453ecc2acc},
  intrahash = {8b0afeee143cec94f3058c214ae38c6f},
}

@misc{Rubin2011,
  author        = {Rubin, Timothy N. and Chambers, America and Smyth, Padhraic and Steyvers, Mark},
  title         = {Statistical Topic Models for Multi-Label Document Classification},
  year          = {2011},
  eprint        = {1107.2462},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1107.2462},
  abstract      = {Machine learning approaches to multi-label document classification have (to date) largely relied on discriminative modeling techniques such as support vector machines. A drawback of these approaches is that performance rapidly drops off as the total number of labels and the number of labels per document increase. This problem is amplified when the label frequencies exhibit the type of highly skewed distributions that are often observed in real-world datasets. In this paper we investigate a class of generative statistical topic models for multi-label documents that associate individual word tokens with different labels. We investigate the advantages of this approach relative to discriminative models, particularly with respect to classification problems involving large numbers of relatively rare labels. We compare the performance of generative and discriminative approaches on document labeling tasks ranging from datasets with several thousand labels to datasets with tens of labels. The experimental results indicate that generative models can achieve competitive multi-label classification performance compared to discriminative methods, and have advantages for datasets with many labels and skewed label frequencies.},
  interhash     = {e09d5d8587756d460a5d834025e75aac},
  intrahash     = {f8a5a3958ae264d19c7f5415eb7f0bce},
}

@book{srivastava2009mining,
  author    = {Srivastava, Ashok N. and Sahami, Mehran},
  title     = {Text mining: classification, clustering, and applications},
  publisher = {{CRC} Press},
  address   = {Boca Raton, FL},
  year      = {2009},
  isbn      = {9781420059403},
  url       = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9781420059403},
  abstract  = {Giving a broad perspective of the field from numerous vantage points, 'Text Mining' focuses on statistical methods for text mining and analysis. It examines methods to automatically cluster and classify text documents and applies these methods in a variety of areas.},
  refid     = {144226505},
  interhash = {290eabe518274b6fbcc73a106a7d52a6},
  intrahash = {45ab79501c114299142864becfa6c841},
}

@article{march06crane,
  author    = {Crane, Gregory},
  title     = {What Do You Do with a Million Books?},
  journal   = {D-Lib Magazine},
  volume    = {12},
  number    = {3},
  month     = mar,
  year      = {2006},
  doi       = {10.1045/march2006-crane},
  issn      = {1082-9873},
  url       = {http://www.dlib.org/dlib/march06/crane/03crane.html},
  interhash = {36d4825e3189d89195693d1449e9aaea},
  intrahash = {eea7ae2ac1480c84f87544f2942c28f2},
}

@article{Luo20091271,
  author    = {Luo, Congnan and Li, Yanjun and Chung, Soon M.},
  title     = {Text document clustering based on neighbors},
  journal   = {Data \& Knowledge Engineering},
  volume    = {68},
  number    = {11},
  pages     = {1271--1288},
  year      = {2009},
  doi       = {10.1016/j.datak.2009.06.007},
  issn      = {0169-023X},
  url       = {http://www.sciencedirect.com/science/article/B6TYX-4WNB4Y8-1/2/1dcd00d9c049988da53b44a526dd6555},
  note      = {Including Special Section: Conference on Privacy in Statistical Databases (PSD 2008) - Six selected and extended papers on Database Privacy},
  abstract  = {Clustering is a very powerful data mining technique for topic discovery from text documents. The partitional clustering algorithms, such as the family of k-means, are reported performing well on document clustering. They treat the clustering problem as an optimization process of grouping documents into k clusters so that a particular criterion function is minimized or maximized. Usually, the cosine function is used to measure the similarity between two documents in the criterion function, but it may not work well when the clusters are not well separated. To solve this problem, we applied the concepts of neighbors and link, introduced in [S. Guha, R. Rastogi, K. Shim, ROCK: a robust clustering algorithm for categorical attributes, Information Systems 25 (5) (2000) 345-366], to document clustering. If two documents are similar enough, they are considered as neighbors of each other. And the link between two documents represents the number of their common neighbors. Instead of just considering the pairwise similarity, the neighbors and link involve the global information into the measurement of the closeness of two documents. In this paper, we propose to use the neighbors and link for the family of k-means algorithms in three aspects: a new method to select initial cluster centroids based on the ranks of candidate documents; a new similarity measure which uses a combination of the cosine and link functions; and a new heuristic function for selecting a cluster to split based on the neighbors of the cluster centroids. Our experimental results on real-life data sets demonstrated that our proposed methods can significantly improve the performance of document clustering in terms of accuracy without increasing the execution time much.},
  interhash = {bf59c4cf26cbc35d6142630b34a66d37},
  intrahash = {13483e90d8b46ef9435ec71473aacee4},
}

@book{feldman2006mining,
  author    = {Feldman, Ronen and Sanger, James},
  title     = {The Text Mining Handbook: Advanced Approaches in Analyzing Unstructured Data},
  publisher = {Cambridge University Press},
  year      = {2007},
  isbn      = {0521836573},
  url       = {http://www.amazon.com/Text-Mining-Handbook-Approaches-Unstructured/dp/0521836573/ref=sr_1_1?s=books&ie=UTF8&qid=1295265273&sr=1-1},
  asin      = {0521836573},
  dewey     = {005.74},
  ean       = {9780521836579},
  interhash = {14cb9e63c6dca7830675c73578dcc30b},
  intrahash = {a310b253098a92c9f6352f568c1a9c37},
}

@inproceedings{conf/sigir/HuFCZLYC08,
  author    = {Hu, Jian and Fang, Lujun and Cao, Yang and Zeng, Hua-Jun and Li, Hua and Yang, Qiang and Chen, Zheng},
  title     = {Enhancing text clustering by leveraging {Wikipedia} semantics},
  booktitle = {{SIGIR}},
  editor    = {Myaeng, Sung-Hyon and Oard, Douglas W. and Sebastiani, Fabrizio and Chua, Tat-Seng and Leong, Mun-Kew},
  publisher = {{ACM}},
  pages     = {179--186},
  year      = {2008},
  date      = {2008-07-27},
  crossref  = {conf/sigir/2008},
  doi       = {10.1145/1390334.1390367},
  isbn      = {978-1-60558-164-4},
  url       = {http://dblp.uni-trier.de/db/conf/sigir/sigir2008.html#HuFCZLYC08},
  ee        = {http://doi.acm.org/10.1145/1390334.1390367},
  interhash = {0a2878165034dcdfacb9045608ec482a},
  intrahash = {76f863a12c0b983ec67682deaec1ada4},
}

@article{wiley2008,
  author      = {Markov, A. and Last, M. and Kandel, A.},
  title       = {The hybrid representation model for web document classification},
  journal     = {International Journal of Intelligent Systems},
  volume      = {23},
  number      = {6},
  pages       = {654--679},
  year        = {2008},
  publisher   = {Wiley Periodicals, Inc.},
  doi         = {10.1002/int.20290},
  url         = {http://dx.doi.org/10.1002/int.20290},
  affiliation = {Department of Information Systems Engineering, Ben-Gurion University of the Negev, Beer-Sheva 84105, Israel; Department of Computer Science and Engineering, University of South Florida, Tampa, FL 33620},
  interhash   = {f56f7f1f800242cbbc9596d4f1ab889f},
  intrahash   = {512b4c9686e5d10763dc7258a10857b4},
}

@article{carpena:035102,
  author    = {Carpena, P. and Bernaola-Galv{\'a}n, P. and Hackenberg, M. and Coronado, A. V. and Oliver, J. L.},
  title     = {Level statistics of words: Finding keywords in literary texts and symbolic sequences},
  journal   = {Physical Review E (Statistical, Nonlinear, and Soft Matter Physics)},
  volume    = {79},
  number    = {3},
  pages     = {035102},
  eid       = {035102},
  numpages  = {4},
  year      = {2009},
  publisher = {{APS}},
  doi       = {10.1103/PhysRevE.79.035102},
  url       = {http://bioinfo2.ugr.es/TextKeywords/},
  interhash = {3444159872c65ea89d007d1838686acc},
  intrahash = {34dcb1eee3ffa31ff4eb77087343c146},
}

@book{UBMA_280507895,
  author    = {Heyer, Gerhard and Quasthoff, Uwe and Wittig, Thomas},
  title     = {{Text Mining}: {Wissensrohstoff} {Text}},
  series    = {IT lernen},
  edition   = {1. korr. Nachdr.},
  publisher = {W3L-Verl.},
  address   = {Herdecke ; Bochum},
  pages     = {XII, 348 S.},
  year      = {2008},
  isbn      = {978-3-937137-30-8},
  url       = {http://aleph.bib.uni-mannheim.de/F/?func=find-b&request=280507895&find_code=020&adjacent=N&local_base=MAN01PUBLIC&x=0&y=0},
  interhash = {d6fa152f7becd0a9d5155f748c29ac22},
  intrahash = {692999b8760981d3b2e0b9103b9d3b0f},
}

@incollection{bloehdorn2006learning,
  author    = {Bloehdorn, Stephan and Cimiano, Philipp and Hotho, Andreas},
  title     = {Learning Ontologies to Improve Text Clustering and Classification},
  booktitle = {From Data and Information Analysis to Knowledge Engineering},
  publisher = {Springer Berlin Heidelberg},
  pages     = {334--341},
  year      = {2006},
  doi       = {10.1007/3-540-31314-1_40},
  isbn      = {978-3-540-31313-7},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2006/2006-03-gfkl05-bloehdorn-etal-learning-ontologies.pdf},
  abstract  = {Recent work has shown improvements in text clustering and classification tasks by integrating conceptual features extracted from ontologies. In this paper we present text mining experiments in the medical domain in which the ontological structures used are acquired automatically in an unsupervised learning process from the text corpus in question. We compare results obtained using the automatically learned ontologies with those obtained using manually engineered ones. Our results show that both types of ontologies improve results on text clustering and classification tasks, whereby the automatically acquired ontologies yield a improvement competitive with the manually engineered ones.},
  interhash = {cf1af505b638677f00b3d3d7a5903199},
  intrahash = {bc1d40cf4fd64780ecf712b1e40f31de},
}

@inproceedings{colas2006behavior,
  author    = {Colas, Fabrice and Brazdil, Pavel},
  title     = {On the Behavior of {SVM} and Some Older Algorithms in Binary Text Classification Tasks},
  booktitle = {Text, Speech and Dialogue},
  pages     = {45--52},
  year      = {2006},
  doi       = {10.1007/11846406_6},
  url       = {http://dx.doi.org/10.1007/11846406_6},
  abstract  = {Document classification has already been widely studied. In fact, some studies compared feature selection techniques or feature space transformation whereas some others compared the performance of different algorithms. Recently, following the rising interest towards the Support Vector Machine, various studies showed that the SVM outperforms other classification algorithms. So should we just not bother about other classification algorithms and opt always for SVM?},
  interhash = {327719a4d7d383ea20592e32e1cd7c50},
  intrahash = {da96ccc48a52cf844e16de4b5a889313},
}