% Bibliography: text classification / text categorization references.
%
% Conventions used in this file:
%   - one field per line, brace-delimited values, fields in alphabetical order
%   - page ranges use a double hyphen (--)
%   - doi holds the bare DOI (no resolver prefix); url is kept alongside it
%   - month uses the predefined lowercase three-letter macros, unquoted
%   - acronyms and proper nouns in titles are brace-protected against recasing
%   - non-standard fields (interhash, intrahash, ee, vgwort, priority,
%     affiliation, timestamp, ...) are ignored by standard styles and are
%     retained verbatim as metadata

% Journal article. The author affiliations were stored in the address field
% (which is meant for the publisher's city); they now live in the ignored
% affiliation field.
@article{wiley2008,
  affiliation = {Department of Information Systems Engineering, Ben-Gurion University of the Negev, Beer-Sheva 84105, Israel; Department of Computer Science and Engineering, University of South Florida, Tampa, FL 33620},
  author      = {Markov, A. and Last, M. and Kandel, A.},
  doi         = {10.1002/int.20290},
  interhash   = {f56f7f1f800242cbbc9596d4f1ab889f},
  intrahash   = {512b4c9686e5d10763dc7258a10857b4},
  journal     = {International Journal of Intelligent Systems},
  number      = {6},
  pages       = {654--679},
  publisher   = {Wiley Periodicals, Inc.},
  title       = {The hybrid representation model for web document classification},
  url         = {http://dx.doi.org/10.1002/int.20290},
  volume      = {23},
  year        = {2008}
}

% Book chapter. The abstract previously ended with a stray RIS end-of-record
% marker, which has been removed.
@incollection{bloehdorn2006learning,
  abstract  = {Recent work has shown improvements in text clustering and classification tasks by integrating conceptual features extracted from ontologies. In this paper we present text mining experiments in the medical domain in which the ontological structures used are acquired automatically in an unsupervised learning process from the text corpus in question. We compare results obtained using the automatically learned ontologies with those obtained using manually engineered ones. Our results show that both types of ontologies improve results on text clustering and classification tasks, whereby the automatically acquired ontologies yield a improvement competitive with the manually engineered ones.},
  author    = {Bloehdorn, Stephan and Cimiano, Philipp and Hotho, Andreas},
  booktitle = {From Data and Information Analysis to Knowledge Engineering},
  doi       = {10.1007/3-540-31314-1_40},
  interhash = {cf1af505b638677f00b3d3d7a5903199},
  intrahash = {bc1d40cf4fd64780ecf712b1e40f31de},
  isbn      = {978-3-540-31313-7},
  pages     = {334--341},
  publisher = {Springer Berlin Heidelberg},
  title     = {Learning Ontologies to Improve Text Clustering and Classification},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2006/2006-03-gfkl05-bloehdorn-etal-learning-ontologies.pdf},
  year      = {2006}
}

% Paper in the "Text, Speech and Dialogue" proceedings; the venue was stored
% in the journal field of an article entry. The doi is taken from the url.
@inproceedings{colas2006behavior,
  abstract  = {Document classification has already been widely studied. In fact, some studies compared feature selection techniques or feature space transformation whereas some others compared the performance of different algorithms. Recently, following the rising interest towards the Support Vector Machine, various studies showed that the SVM outperforms other classification algorithms. So should we just not bother about other classification algorithms and opt always for SVM?},
  author    = {Colas, Fabrice and Brazdil, Pavel},
  booktitle = {Text, Speech and Dialogue},
  doi       = {10.1007/11846406_6},
  interhash = {327719a4d7d383ea20592e32e1cd7c50},
  intrahash = {da96ccc48a52cf844e16de4b5a889313},
  pages     = {45--52},
  title     = {On the Behavior of {SVM} and Some Older Algorithms in Binary Text Classification Tasks},
  url       = {http://dx.doi.org/10.1007/11846406_6},
  year      = {2006}
}

% Conference paper; the booktitle was in inverted catalogue order and has been
% rewritten in reading order.
@inproceedings{PuWang:2007,
  abstract  = {The exponential growth of text documents available on the Internet has created an urgent need for accurate, fast, and general purpose text classification algorithms. However, the ``bag of words'' representation used for these classification methods is often unsatisfactory as it ignores relationships between important terms that do not co-occur literally. In order to deal with this problem, we integrate background knowledge - in our application: Wikipedia - into the process of classifying text documents. The experimental evaluation on Reuters newsfeeds and several other corpus shows that our classification results with encyclopedia knowledge are much better than the baseline ``bag of words'' methods.},
  author    = {Wang, Pu and Hu, Jian and Zeng, Hua-Jun and Chen, Lijun and Chen, Zheng},
  booktitle = {Proceedings of the Seventh {IEEE} International Conference on Data Mining ({ICDM} 2007)},
  doi       = {10.1109/ICDM.2007.77},
  interhash = {8a899b60047e20e162fc12b2ff6f8142},
  intrahash = {66058efbca5abd1222f72c32365d23fa},
  isbn      = {978-0-7695-3018-5},
  issn      = {1550-4786},
  pages     = {332--341},
  title     = {Improving Text Classification by Using Encyclopedia Knowledge},
  url       = {ftp://ftp.computer.org/press/outgoing/proceedings/icdm07/Data/3018a332.pdf},
  year      = {2007}
}

% Workshop paper. The first editor's surname is "De Raedt"; it is written in
% "Last, First" form so the particle is not parsed as a given name.
@inproceedings{IfrimTW-ICML2005,
  address   = {Bonn, Germany},
  author    = {Ifrim, Georgiana and Theobald, Martin and Weikum, Gerhard},
  booktitle = {Proceedings of the 22nd International Conference on Machine Learning - Learning in Web Search (LWS 2005)},
  editor    = {De Raedt, Luc and Wrobel, Stefan},
  interhash = {a54c4070e0fb55f5a084a0f088230a65},
  intrahash = {57f8241941ed979455c3dbb90893020f},
  isbn      = {1-59593-180-5},
  pages     = {18--26},
  title     = {Learning Word-to-Concept Mappings for Automatic Text Classification},
  url       = {http://www.mpi-inf.mpg.de/~ifrim/publications/icml-lws05.pdf},
  year      = {2005}
}

% Conference paper; the url was missing its scheme.
@inproceedings{baker98distributional,
  address   = {Melbourne, AU},
  author    = {Baker, L. Douglas and McCallum, Andrew K.},
  booktitle = {Proceedings of {SIGIR}-98, 21st {ACM} International Conference on Research and Development in Information Retrieval},
  editor    = {Croft, W. Bruce and Moffat, Alistair and van Rijsbergen, Cornelis J. and Wilkinson, Ross and Zobel, Justin},
  interhash = {f116fa6b3ef1eefecb8bf27dfaa53ee7},
  intrahash = {e472dc4e61921ed15175756fcd9fea6a},
  pages     = {96--103},
  publisher = {ACM Press, New York, US},
  title     = {Distributional clustering of words for text classification},
  url       = {http://citeseer.ist.psu.edu/baker98distributional.html},
  year      = {1998}
}

% Conference paper; volume is the number within the series.
@inproceedings{lauser03,
  author    = {Lauser, Boris and Hotho, Andreas},
  booktitle = {Proceedings of the 7th European Conference in Research and Advanced Technology for Digital Libraries, {ECDL} 2003},
  interhash = {feb38928054a3691f83122b0172c5116},
  intrahash = {8b298c325c6ecdb9c01e01057464ae2d},
  pages     = {140--151},
  publisher = {Springer},
  series    = {LNCS},
  title     = {Automatic multi-label subject indexing in a multilingual environment},
  volume    = {2769},
  year      = {2003}
}

% Book chapter. Shares its title with bloehdorn04msw and bloehdorn04Boosting,
% which are separate entries with different venues.
@incollection{bloehdorn2006boosting,
  author               = {Bloehdorn, Stephan and Hotho, Andreas},
  booktitle            = {Advances in Web Mining and Web Usage Analysis},
  citeulike-article-id = {910161},
  doi                  = {10.1007/11899402_10},
  interhash            = {9351de43861fe23833c5d074059f310a},
  intrahash            = {e297ea25cc01678c35501a44115aea8d},
  isbn                 = {978-3-540-47127-1},
  issn                 = {0302-9743},
  pages                = {149--166},
  priority             = {2},
  publisher            = {Springer},
  series               = {LNCS},
  title                = {Boosting for Text Classification with Semantic Features},
  url                  = {http://dx.doi.org/10.1007/11899402_10},
  vgwort               = {32},
  volume               = {3932},
  year                 = {2006}
}

% Conference paper. The crossref target conf/icdm/2003 is not defined in this
% part of the file; it must be defined after this entry for classic BibTeX.
% The former date field (2004-01-28) disagreed with year and would take
% precedence over it under biblatex; it is preserved as an ignored timestamp.
% NOTE(review): that value is presumably a record-modification date - confirm.
@inproceedings{conf/icdm/PopesculULP03,
  author    = {Popescul, Alexandrin and Ungar, Lyle H. and Lawrence, Steve and Pennock, David M.},
  booktitle = {ICDM},
  crossref  = {conf/icdm/2003},
  ee        = {http://csdl.computer.org/comp/proceedings/icdm/2003/1978/00/19780275abs.htm},
  interhash = {3bcb76c6628b1752db555f86fe39429e},
  intrahash = {7cdd6b0791fcdf17ec6d404b55f12c5c},
  isbn      = {0-7695-1978-4},
  pages     = {275--282},
  publisher = {IEEE Computer Society},
  timestamp = {2004-01-28},
  title     = {Statistical Relational Learning for Document Mining},
  url       = {http://www.cis.upenn.edu/~popescul/Publications/popescul03dm.pdf},
  year      = {2003}
}

% Journal article. The author's given name is spelled out to match entry TM05.
@article{Sebastiani02,
  author             = {Sebastiani, Fabrizio},
  bb-further-address = {--Dordrecht--London},
  interhash          = {d945d9218673dad37dc2a06cbf9e554c},
  intrahash          = {0fe0d5dd12c2cb59dfc330e684ec4b4a},
  journal            = {ACM Computing Surveys},
  number             = {1},
  pages              = {1--47},
  title              = {Machine learning in automated text categorization},
  url                = {http://faure.iei.pi.cnr.it/~fabrizio/Publications/ACMCS02.pdf},
  volume             = {34},
  year               = {2002}
}

% Conference paper.
@inproceedings{bloehdorn2004icdm,
  author    = {Bloehdorn, Stephan and Hotho, Andreas},
  booktitle = {Proceedings of the Fourth IEEE International Conference on Data Mining},
  interhash = {f18089d50fdc9c9e38c4fc1e350bdd4e},
  intrahash = {7df6357c79445d811f4a9223e688da14},
  month     = nov,
  pages     = {331--334},
  publisher = {IEEE Computer Society Press},
  title     = {Text Classification by Boosting Weak Learners based on Terms and Concepts},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/icdm04boosting.pdf},
  year      = {2004}
}

% Workshop paper.
@inproceedings{bloehdorn04msw,
  author    = {Bloehdorn, Stephan and Hotho, Andreas},
  booktitle = {Proceedings of the MSW 2004 workshop at the 10th ACM SIGKDD Conference on Knowledge Discovery and Data Mining},
  interhash = {9b4b685dda669fc66659f810ceb97890},
  intrahash = {584e82bad6eb767636ce3ddd8f1ae233},
  month     = aug,
  pages     = {70--87},
  title     = {Boosting for Text Classification with Semantic Features},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/msw04bloehdorn.pdf},
  year      = {2004}
}

% Chapter in an edited book.
@incollection{TM05,
  address   = {Southampton, UK},
  author    = {Sebastiani, Fabrizio},
  booktitle = {Text Mining and its Applications to Intelligence, CRM and Knowledge Management},
  editor    = {Zanasi, Alessandro},
  interhash = {0343f986403f021264f6eea761722fd6},
  intrahash = {3b839b06f8139db217a9eb61d6015fc0},
  pages     = {109--129},
  publisher = {WIT Press},
  title     = {Text Categorization},
  url       = {http://www.isti.cnr.it/People/F.Sebastiani/Publications/TM05.pdf},
  year      = {2005}
}

% Journal article. "Apr" was stored in the number field; it is a month and is
% now given through the month macro.
@article{Lew04,
  author    = {Lewis, D. D. and Yang, Y. and Rose, T. G. and Li, F.},
  interhash = {ff940c50e028cb53fc10f99ddd39fe3e},
  intrahash = {0db455903d09c97f4f6ccbfb95c66f9e},
  journal   = {Journal of Machine Learning Research},
  month     = apr,
  pages     = {361--397},
  title     = {{RCV1}: A New Benchmark Collection for Text Categorization Research},
  url       = {http://www.jmlr.org/papers/volume5/lewis04a/lewis04a.pdf},
  volume    = {5},
  year      = {2004}
}

% Conference paper; the proceedings title was stored in the journal field of
% an article entry. The whole-title double braces are replaced by Title Case
% with only the proper noun protected.
% NOTE(review): pages 29--31 is unusually short for this paper - verify
% against the proceedings.
@inproceedings{rose2002rcv,
  author    = {Rose, T. G. and Stevenson, M. and Whitehead, M.},
  booktitle = {Proceedings of the Third International Conference on Language Resources and Evaluation},
  interhash = {3a817ba1a6c5e81efa0acc7faaa790ad},
  intrahash = {b88233fe20a4454a2af000ad260166e7},
  pages     = {29--31},
  title     = {The {Reuters} Corpus Volume 1 -- from Yesterday's News to Tomorrow's Language Resources},
  url       = {http://about.reuters.com/researchandstandards/corpus/LREC_camera_ready.pdf},
  year      = {2002}
}

% Workshop paper; the "(reprint)" marker in the title comes from the original
% record.
@inproceedings{bloehdorn04Boosting,
  author    = {Bloehdorn, Stephan and Hotho, Andreas},
  booktitle = {Proceedings of the Workshop on Text-based Information Retrieval (TIR-04) at the 27th German Conference on Artificial Intelligence},
  interhash = {34e79ac08abb52395ef1c70f03488a8d},
  intrahash = {b4dd28d4b393a4d12650fa246176883a},
  month     = sep,
  title     = {Boosting for Text Classification with Semantic Features (reprint)},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/tir04final.pdf},
  year      = {2004}
}

% Journal article. Percent signs in the abstract are escaped so the field is
% safe if a style prints it. The doi is taken from the url.
% NOTE(review): the authors' given names are missing from the record.
@article{richter2005metadata,
  abstract  = {During the last decade, the advance of machine-learning tools and algorithms has resulted in tremendous progress in the automated classification of documents. However, many classifiers base their classification decisions solely on document text and ignore metadata (such as authors, publication date, and author affiliation). In this project, automated classifiers using the k-Nearest Neighbour algorithm were developed for the classification of patents into two different classification systems. Those using metadata (in this case inventor names, applicant names and International Patent Classification codes) were compared with those ignoring it. The use of metadata could significantly improve the classification of patents with one classification system, improving classification accuracy from 70.8\% up to 75.4\%, which was highly statistically significant. However, the results for the other classification system were inconclusive: while metadata could improve the quality of the classifier for some experiments (recall increased from 66.0\% to 68.9\%, which was a small but nonetheless significant improvement), experiments with different parameters showed that it could also lead to a deterioration of quality (recall dropping as low as 61.0\%). The study shows that metadata can play an extremely useful role in the classification of patents. Nonetheless, it must not be used indiscriminately but only after careful evaluation of its usefulness.},
  author    = {Richter and MacFarlane},
  doi       = {10.1016/j.wpi.2004.08.001},
  interhash = {c7749092c6e5a90cd43fe022fa398e0b},
  intrahash = {d15595d5279e762207d67f2a9b688c37},
  journal   = {World Patent Information},
  pages     = {12--26},
  title     = {The impact of metadata on the accuracy of automated patent classification},
  url       = {http://dx.doi.org/10.1016/j.wpi.2004.08.001},
  volume    = {27},
  year      = {2005}
}