% Bibliography entries on document modelling, document quality and document clustering.
% Layout: one field per line, fields in alphabetical order, values delimited by braces.
% The fields ee, interhash, intrahash, acmid, numpages, comment, bibsource and timestamp
% are not part of the standard BibTeX data model; they are kept as exported.
% Page ranges use a double hyphen and doi holds the bare identifier without a resolver prefix.

@inproceedings{conf/dis/PontiTK11,
  author    = {Ponti, Giovanni and Tagarelli, Andrea and Karypis, George},
  booktitle = {Discovery Science},
  crossref  = {conf/dis/2011},
  editor    = {Elomaa, Tapio and Hollm{\'e}n, Jaakko and Mannila, Heikki},
  ee        = {http://dx.doi.org/10.1007/978-3-642-24477-3_21},
  interhash = {1d2b8fd777a36c3c42c10dac886d5d25},
  intrahash = {af476c498b77848fa7c8121c8955a307},
  isbn      = {978-3-642-24476-6},
  pages     = {247--261},
  publisher = {Springer},
  series    = {Lecture Notes in Computer Science},
  title     = {A Statistical Model for Topically Segmented Documents},
  url       = {http://dblp.uni-trier.de/db/conf/dis/dis2011.html#PontiTK11},
  volume    = 6926,
  year      = 2011
}

@article{journals/ml/DuBJ10,
  author    = {Du, Lan and Buntine, Wray L. and Jin, Huidong},
  ee        = {http://dx.doi.org/10.1007/s10994-010-5197-4},
  interhash = {f39304f04fa411cc2c9232aa7eb83b83},
  intrahash = {286291dfe97008c5bda330ffc0b72af1},
  journal   = {Machine Learning},
  number    = 1,
  pages     = {5--19},
  title     = {A segmented topic model based on the two-parameter {Poisson-Dirichlet} process},
  url       = {http://dblp.uni-trier.de/db/journals/ml/ml81.html#DuBJ10},
  volume    = 81,
  year      = 2010
}

@inproceedings{stvilia2005assessing,
  abstract  = {Effective information quality analysis needs powerful yet easy ways to obtain
               metrics. The English version of Wikipedia provides an extremely interesting yet
               challenging case for the study of Information Quality dynamics at both macro and
               micro levels. We propose seven {IQ} metrics which can be evaluated automatically
               and test the set on a representative sample of Wikipedia content. The methodology
               of the metrics construction and the results of tests, along with a number of
               statistical characterizations of Wikipedia articles, their content construction,
               process metadata and social context are reported.},
  address   = {Cambridge, MA},
  author    = {Stvilia, Besiki and Twidale, Michael B. and Smith, Linda C. and Gasser, Les},
  booktitle = {Proceedings of the 2005 International Conference on Information Quality},
  interhash = {b84acb7b890edee9c53c216b0faadbec},
  intrahash = {33be94d76729286b2bfc3d31a98f88db},
  pages     = {442--454},
  publisher = {MIT},
  title     = {Assessing information quality of a community-based encyclopedia},
  year      = 2005
}

% The model names in the abstract below are typeset with \textsc in place of the HTML
% small-caps tags that the export left behind.
@inproceedings{hu2007measuring,
  abstract  = {Wikipedia has grown to be the world largest and busiest free encyclopedia, in
               which articles are collaboratively written and maintained by volunteers online.
               Despite its success as a means of knowledge sharing and collaboration, the public
               has never stopped criticizing the quality of Wikipedia articles edited by
               non-experts and inexperienced contributors. In this paper, we investigate the
               problem of assessing the quality of articles in collaborative authoring of
               Wikipedia. We propose three article quality measurement models that make use of
               the interaction data between articles and their contributors derived from the
               article edit history. Our B\textsc{asic} model is designed based on the mutual
               dependency between article quality and their author authority. The
               P\textsc{eer}R\textsc{eview} model introduces the review behavior into measuring
               article quality. Finally, our P\textsc{rob}R\textsc{eview} models extend
               P\textsc{eer}R\textsc{eview} with partial reviewership of contributors as they
               edit various portions of the articles. We conduct experiments on a set of
               well-labeled Wikipedia articles to evaluate the effectiveness of our quality
               measurement models in resembling human judgement.},
  acmid     = {1321476},
  address   = {New York, NY, USA},
  author    = {Hu, Meiqun and Lim, Ee-Peng and Sun, Aixin and Lauw, Hady Wirawan and Vuong, Ba-Quy},
  booktitle = {Proceedings of the Sixteenth ACM Conference on Information and Knowledge Management},
  doi       = {10.1145/1321440.1321476},
  interhash = {7fceff7d0b5943b21f66d970cfd65ccb},
  intrahash = {cd9077443f7519e9cdce492858753632},
  isbn      = {978-1-59593-803-9},
  location  = {Lisbon, Portugal},
  numpages  = {10},
  pages     = {243--252},
  publisher = {ACM},
  series    = {CIKM '07},
  title     = {Measuring article quality in {Wikipedia}: models and evaluation},
  url       = {http://doi.acm.org/10.1145/1321440.1321476},
  year      = 2007
}

@inproceedings{zhou2005document,
  abstract  = {The quality of document content, which is an issue that is usually ignored for
               the traditional ad hoc retrieval task, is a critical issue for Web search. Web
               pages have a huge variation in quality relative to, for example, newswire
               articles. To address this problem, we propose a document quality language model
               approach that is incorporated into the basic query likelihood retrieval model in
               the form of a prior probability. Our results demonstrate that, on average, the
               new model is significantly better than the baseline (query likelihood model) in
               terms of precision at the top ranks.},
  acmid     = {1099652},
  address   = {New York, NY, USA},
  author    = {Zhou, Yun and Croft, W. Bruce},
  booktitle = {Proceedings of the 14th ACM International Conference on Information and Knowledge Management},
  doi       = {10.1145/1099554.1099652},
  interhash = {01264e5f48959d326724b405d3898337},
  intrahash = {d190feee02f804aea11f19979d3642b8},
  isbn      = {1-59593-140-6},
  location  = {Bremen, Germany},
  numpages  = {2},
  pages     = {331--332},
  publisher = {ACM},
  series    = {CIKM '05},
  title     = {Document quality models for web ad hoc retrieval},
  url       = {http://doi.acm.org/10.1145/1099554.1099652},
  year      = 2005
}

% This key must appear only once; a second identical copy of this entry was removed
% because BibTeX rejects repeated keys.
@book{lamport86latex,
  author    = {Lamport, Leslie},
  interhash = {299a6bdbe8ea410f3e674b60782e83e5},
  intrahash = {555cb6b96466e755bd0dedb9b04f06ab},
  publisher = {Addison-Wesley},
  title     = {{LaTeX}: A Document Preparation System},
  year      = 1986
}

@inproceedings{hotho03wordnet,
  address   = {Toronto},
  author    = {Hotho, A. and Staab, S. and Stumme, G.},
  booktitle = {Proc. SIGIR Semantic Web Workshop},
  comment   = {alpha},
  interhash = {c2a9a89ce20cef90a1e78d34dc2c2afe},
  intrahash = {04c7d86337d68e4ed9ae637029c43414},
  title     = {Wordnet improves text document clustering},
  url       = {http://www.kde.cs.uni-kassel.de/stumme/papers/2003/hotho2003wordnet.pdf},
  year      = 2003
}

% NOTE(review): the journal value below reads like a special-issue heading rather than a
% journal name. It is kept as exported; confirm against the publisher record.
@article{fortuna2005vtd,
  author    = {Fortuna, B. and Grobelnik, M. and Mladeni{\'c}, D.},
  interhash = {3db2674afa1f013492af950b6606b780},
  intrahash = {cc348c82464832a3f6a165978973e179},
  journal   = {Special Issue: Hot Topics in European Agent Research I Guest Editors: Andrea Omicini},
  pages     = {497--502},
  title     = {Visualization of text document corpus},
  volume    = 29,
  year      = 2005
}

% In the three entries below the exported date value disagreed with year, so it is stored
% in timestamp instead. It is presumably the catalogue record date, not the publication
% date; confirm before relying on it.
@article{journals/ml/ZhaoK04,
  author    = {Zhao, Ying and Karypis, George},
  ee        = {http://dx.doi.org/10.1023/B:MACH.0000027785.44527.d6},
  interhash = {900e28ac463ff44cbadf055cfbcf92ea},
  intrahash = {71ea6e1192ea34ac8193867c2512927a},
  journal   = {Machine Learning},
  number    = 3,
  pages     = {311--331},
  timestamp = {2005-12-08},
  title     = {Empirical and Theoretical Comparisons of Selected Criterion Functions for Document Clustering},
  url       = {http://glaros.dtc.umn.edu/gkhome/fetch/papers/crfunML04.pdf},
  volume    = 55,
  year      = 2004
}

@article{journals/tois/ChowdhuryFGM02,
  author    = {Chowdhury, Abdur and Frieder, Ophir and Grossman, David A. and McCabe, M. Catherine},
  ee        = {http://doi.acm.org/10.1145/506309.506311},
  interhash = {4357db306875755262451e702911ebe0},
  intrahash = {24249e2a7b8b809050f9083fc75d3c18},
  journal   = {ACM Trans. Inf. Syst.},
  number    = 2,
  pages     = {171--191},
  timestamp = {2003-11-25},
  title     = {Collection statistics for fast duplicate document detection},
  url       = {http://dblp.uni-trier.de/db/journals/tois/tois20.html#ChowdhuryFGM02},
  volume    = 20,
  year      = 2002
}

@inproceedings{conf/icdm/PopesculULP03,
  author    = {Popescul, Alexandrin and Ungar, Lyle H. and Lawrence, Steve and Pennock, David M.},
  booktitle = {ICDM},
  crossref  = {conf/icdm/2003},
  ee        = {http://csdl.computer.org/comp/proceedings/icdm/2003/1978/00/19780275abs.htm},
  interhash = {3bcb76c6628b1752db555f86fe39429e},
  intrahash = {7cdd6b0791fcdf17ec6d404b55f12c5c},
  isbn      = {0-7695-1978-4},
  pages     = {275--282},
  publisher = {IEEE Computer Society},
  timestamp = {2004-01-28},
  title     = {Statistical Relational Learning for Document Mining},
  url       = {http://www.cis.upenn.edu/~popescul/Publications/popescul03dm.pdf},
  year      = 2003
}

@inproceedings{steinbach00comparison,
  author    = {Steinbach, M. and Karypis, G. and Kumar, V.},
  booktitle = {KDD Workshop on Text Mining},
  interhash = {3340fbf75ada2ccb45a50dd5832f5f07},
  intrahash = {10e5c1e3ff54d9dce505a231f8ae7b32},
  title     = {A comparison of document clustering techniques},
  url       = {http://citeseer.ist.psu.edu/steinbach00comparison.html},
  year      = 2000
}

@article{1131907,
  address   = {New York, NY, USA},
  author    = {Tjhi, William-Chandra and Chen, Lihui},
  doi       = {10.1016/j.patrec.2005.07.012},
  interhash = {1e398099e0879fba9a7b874a10a4fd20},
  intrahash = {c5916b7f616900d5dbe103395450e83c},
  issn      = {0167-8655},
  journal   = {Pattern Recogn. Lett.},
  number    = 3,
  pages     = {151--159},
  publisher = {Elsevier Science Inc.},
  title     = {A partitioning based algorithm to fuzzy co-cluster documents and words},
  volume    = 27,
  year      = 2006
}

@inproceedings{KS97,
  author    = {Koller, D. and Sahami, M.},
  bibsource = {DBLP, http://dblp.uni-trier.de},
  booktitle = {Proceedings of the 14th International Conference on Machine Learning (ML), Nashville, Tennessee, July 1997},
  interhash = {b560ab99c39d28d5a4a03a3bad8c32b8},
  intrahash = {65ee2a885dbfebd7e0c456e9754e0579},
  pages     = {170--178},
  title     = {Hierarchically classifying documents using very few words},
  year      = 1997
}

@inproceedings{Maedcheetalsubmitted,
  address   = {Hawaii},
  author    = {Maedche, A. and Ehrig, M. and Handschuh, S. and Stojanovic, L. and Volz, R.},
  booktitle = {Proceedings of the Eleventh International World Wide Web Conference WWW-2002},
  interhash = {833fc61ede44e31a5af6c77c86baa43a},
  intrahash = {f20b5a51398bb01659ad099794ebf06d},
  isbn      = {90-74821-43-X},
  title     = {Ontology-Focused Crawling of Documents and Relational Metadata},
  year      = 2002
}