% Bibliography entries (UTF-8 encoded; interhash/intrahash are kept as
% exported and are ignored by standard styles).

% B{\'i}r{\'o} and Szab{\'o} (2009): LDA applied to supervised document categorization.
% NOTE(review): the citation key contains a non-ASCII character and is kept
% unchanged so existing citations keep resolving; classic 8-bit BibTeX
% toolchains may not accept it -- confirm the target toolchain is Biber/UTF-8.
% NOTE(review): entry type changed from article to inproceedings because the
% DOI is a chapter-style DOI inside a proceedings volume -- confirm.
@inproceedings{istván2009latent,
  author    = {Bíró, István and Szabó, Jácint},
  title     = {Latent {Dirichlet} Allocation for Automatic Document Categorization},
  booktitle = {Machine Learning and Knowledge Discovery in Databases},
  pages     = {430--441},
  year      = 2009,
  doi       = {10.1007/978-3-642-04174-7_28},
  url       = {http://dx.doi.org/10.1007/978-3-642-04174-7_28},
  abstract  = {In this paper we introduce and evaluate a technique for applying latent Dirichlet allocation to supervised semantic categorization of documents. In our setup, for every category an own collection of topics is assigned, and for a labeled training document only topics from its category are sampled. Thus, compared to the classical LDA that processes the entire corpus in one, we essentially build separate LDA models for each category with the category-specific topics, and then these topic collections are put together to form a unified LDA model. For an unseen document the inferred topic distribution gives an estimation how much the document fits into the category.},
  interhash = {f4c5b12409be4108320cba5b8fd18c45},
  intrahash = {2db7477d992284eabea47e1c9669ab5a},
}

% Bencz{\'u}r et al. (2008): survey of Web spam filtering for Web archives (IWAW'08).
% The address field holds the location given in the original record
% (spelling corrected to Aarhus).
@inproceedings{benczur2008survey,
  author    = {Benczúr, András A. and
               Siklósi, Dávid and
               Szabó, Jácint and
               Bíró, István and
               Fekete, Zsolt and
               Kurucz, Miklós and
               Pereszlényi, Attila and
               Rácz, Simon and
               Szabó, Adrienn},
  title     = {Web Spam: a Survey with Vision for the Archivist},
  booktitle = {Proceedings of the 8th International Web Archiving Workshop IWAW'08},
  address   = {Aarhus, Denmark},
  month     = sep,
  year      = 2008,
  url       = {http://iwaw.europarchive.org/08/IWAW2008-Benczur.pdf},
  abstract  = {While Web archive quality is endangered by Web spam, a side effect of the high commercial value of top-ranked search-engine results, so far Web spam filtering technologies are rarely used by Web archivists. In this paper we make the first attempt to disseminate existing methodology and envision a solution for Web archives to share knowledge and unite efforts in Web spam hunting. We survey the state of the art in Web spam filtering illustrated by the recent Web spam challenge data sets and techniques and describe the filtering solution for archives envisioned in the LiWA—Living Web Archives project.},
  interhash = {b09d09a4d29ba2a80a5a29b9a76ed5f0},
  intrahash = {911a912a75e50451923522223f7717e8},
}