@string{acmcs = {ACM Computing Surveys}}

@inproceedings{siersdorfer2009automatic,
  author    = {Siersdorfer, Stefan and San Pedro, Jose and Sanderson, Mark},
  title     = {Automatic video tagging using content redundancy},
  booktitle = {Proceedings of the 32nd international {ACM} {SIGIR} conference on Research and development in information retrieval},
  series    = {SIGIR '09},
  location  = {Boston, MA, USA},
  pages     = {395--402},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2009},
  isbn      = {978-1-60558-483-6},
  doi       = {10.1145/1571941.1572010},
  url       = {http://doi.acm.org/10.1145/1571941.1572010},
  acmid     = {1572010},
  abstract  = {The analysis of the leading social video sharing platform YouTube reveals a high amount of redundancy, in the form of videos with overlapping or duplicated content. In this paper, we show that this redundancy can provide useful information about connections between videos. We reveal these links using robust content-based video analysis techniques and exploit them for generating new tag assignments. To this end, we propose different tag propagation methods for automatically obtaining richer video annotations. Our techniques provide the user with additional information about videos, and lead to enhanced feature representations for applications such as automatic data organization and search. Experiments on video clustering and classification as well as a user evaluation demonstrate the viability of our approach.},
  interhash = {276b49e417d441ba50bfc6e4b85be1f3},
  intrahash = {71c3a120e154ed135408292eb4b96278},
}

@article{martin2011enhancing,
  author    = {Atzmueller, Martin and Benz, Dominik and Doerfel, Stephan and Hotho, Andreas and J{\"a}schke, Robert and Macek, Bjoern Elmar and Mitzlaff, Folke and Scholz, Christoph and Stumme, Gerd},
  title     = {Enhancing Social Interactions at Conferences},
  journal   = {it - Information Technology},
  volume    = {53},
  number    = {3},
  pages     = {101--107},
  month     = may,
  year      = {2011},
  publisher = {Oldenbourg Wissenschaftsverlag GmbH},
  issn      = {1611-2776},
  doi       = {10.1524/itit.2011.0631},
  url       = {http://dx.doi.org/10.1524/itit.2011.0631},
  interhash = {e57bff1f73b74e6f1fe79e4b40956c35},
  intrahash = {1dc34c1620c45a9bbd548bb73f989aea},
}

@article{blondel2008fasta,
  author    = {Blondel, Vincent D. and Guillaume, Jean-Loup and Lambiotte, Renaud and Lefebvre, Etienne},
  title     = {Fast unfolding of communities in large networks},
  journal   = {Journal of Statistical Mechanics: Theory and Experiment},
  volume    = {2008},
  number    = {10},
  pages     = {P10008 (12pp)},
  year      = {2008},
  abstract  = {We propose a simple method to extract the community structure of large networks. Our method is a heuristic method that is based on modularity optimization. It is shown to outperform all other known community detection methods in terms of computation time. Moreover, the quality of the communities detected is very good, as measured by the so-called modularity. This is shown first by identifying language communities in a Belgian mobile phone network of 2 million customers and by analysing a web graph of 118 million nodes and more than one billion links. The accuracy of our algorithm is also verified on ad hoc modular networks.},
  groups    = {public},
  localfile = {/home/aynaud/biblio/articles/louvain.pdf},
  timestamp = {2009-09-21 01:52:25},
  username  = {dbenz},
  interhash = {65254c1a703db2ce225cee4b56ea12ae},
  intrahash = {7855df1049bee476ad64ee3c29c29f0f},
}

@article{qi2009classification,
  author     = {Qi, Xiaoguang and Davison, Brian D.},
  title      = {Web page classification: Features and algorithms},
  journal    = acmcs,
  volume     = {41},
  number     = {2},
  articleno  = {12},
  pages      = {12:1--12:31},
  numpages   = {31},
  month      = feb,
  year       = {2009},
  issue_date = {February 2009},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0360-0300},
  doi        = {10.1145/1459352.1459357},
  url        = {http://doi.acm.org/10.1145/1459352.1459357},
  acmid      = {1459357},
  abstract   = {Classification of Web page content is essential to many tasks in Web information retrieval such as maintaining Web directories and focused crawling. The uncontrolled nature of Web content presents additional challenges to Web page classification as compared to traditional text classification, but the interconnected nature of hypertext also provides features that can assist the process. As we review work in Web page classification, we note the importance of these Web-specific features and algorithms, describe state-of-the-art practices, and track the underlying assumptions behind the use of information from neighboring pages.},
  interhash  = {3e1f5f696040766fa9c1993748cdc465},
  intrahash  = {a89af712b3597fb94e2a25f309324f4b},
}

@incollection{choi2005classification,
  author      = {Choi, B. and Yao, Z.},
  title       = {Web Page Classification},
  booktitle   = {Foundations and Advances in Data Mining},
  editor      = {Chu, Wesley and Lin, Tsau Young},
  series      = {Studies in Fuzziness and Soft Computing},
  volume      = {180},
  pages       = {221--274},
  publisher   = {Springer},
  address     = {Berlin / Heidelberg},
  year        = {2005},
  isbn        = {978-3-540-25057-9},
  doi         = {10.1007/11362197_9},
  url         = {http://dx.doi.org/10.1007/11362197_9},
  keyword     = {Engineering},
  affiliation = {Louisiana Tech University Computer Science, College of Engineering and Science Ruston LA 71272 USA},
  abstract    = {This chapter describes systems that automatically classify web pages into meaningful categories. It first defines two types of web page classification: subject based and genre based classifications. It then describes the state of the art techniques and subsystems used to build automatic web page classification systems, including web page representations, dimensionality reductions, web page classifiers, and evaluation of web page classifiers. Such systems are essential tools for Web Mining and for the future of Semantic Web.},
  interhash   = {ee6322086817459b37b7cb774be344cc},
  intrahash   = {8cae2b481187e15ec17f1aeac3c80a68},
}

@inproceedings{illig2011comparison,
  author    = {Illig, Jens and Hotho, Andreas and J{\"a}schke, Robert and Stumme, Gerd},
  title     = {A Comparison of Content-Based Tag Recommendations in Folksonomy Systems},
  booktitle = {Knowledge Processing and Data Analysis},
  editor    = {Wolff, Karl Erich and Palchunov, Dmitry E. and Zagoruiko, Nikolay G. and Andelfinger, Urs},
  series    = {Lecture Notes in Computer Science},
  volume    = {6581},
  pages     = {136--149},
  publisher = {Springer},
  address   = {Berlin/Heidelberg},
  year      = {2011},
  isbn      = {978-3-642-22139-2},
  doi       = {10.1007/978-3-642-22140-8_9},
  url       = {http://dx.doi.org/10.1007/978-3-642-22140-8_9},
  abstract  = {Recommendation algorithms and multi-class classifiers can support users of social bookmarking systems in assigning tags to their bookmarks. Content based recommenders are the usual approach for facing the cold start problem, i.e., when a bookmark is uploaded for the first time and no information from other users can be exploited. In this paper, we evaluate several recommendation algorithms in a cold-start scenario on a large real-world dataset.},
  interhash = {cd3420c0f73761453320dc528b3d1e14},
  intrahash = {f9d6e06ab0f2fdcebb77afa97d72e40a},
}

@inproceedings{angelova2006graphbased,
  author    = {Angelova, Ralitsa and Weikum, Gerhard},
  title     = {Graph-based text classification: learn from your neighbors},
  booktitle = {Proceedings of the 29th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  series    = {SIGIR '06},
  location  = {Seattle, Washington, USA},
  pages     = {485--492},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2006},
  isbn      = {1-59593-369-7},
  doi       = {10.1145/1148170.1148254},
  url       = {http://doi.acm.org/10.1145/1148170.1148254},
  acmid     = {1148254},
  abstract  = {Automatic classification of data items, based on training samples, can be boosted by considering the neighborhood of data items in a graph structure (e.g., neighboring documents in a hyperlink environment or co-authors and their publications for bibliographic data entries). This paper presents a new method for graph-based classification, with particular emphasis on hyperlinked text documents but broader applicability. Our approach is based on iterative relaxation labeling and can be combined with either Bayesian or SVM classifiers on the feature spaces of the given data items. The graph neighborhood is taken into consideration to exploit locality patterns while at the same time avoiding overfitting. In contrast to prior work along these lines, our approach employs a number of novel techniques: dynamically inferring the link/class pattern in the graph in the run of the iterative relaxation labeling, judicious pruning of edges from the neighborhood graph based on node dissimilarities and node degrees, weighting the influence of edges based on a distance metric between the classification labels of interest and weighting edges by content similarity measures. Our techniques considerably improve the robustness and accuracy of the classification outcome, as shown in systematic experimental comparisons with previously published methods on three different real-world datasets.},
  interhash = {ac882d49fa7067c6a0bfca1583ef29b2},
  intrahash = {f4cffa23d78bab3284db2f233c226425},
}

@article{Chakrabarti:1998:SFS:765529.765533,
  author    = {Chakrabarti, Soumen and Dom, Byron and Agrawal, Rakesh and Raghavan, Prabhakar},
  title     = {Scalable feature selection, classification and signature generation for organizing large text databases into hierarchical topic taxonomies},
  journal   = {The VLDB Journal},
  volume    = {7},
  number    = {3},
  pages     = {163--178},
  numpages  = {16},
  month     = aug,
  year      = {1998},
  publisher = {Springer-Verlag New York, Inc.},
  address   = {Secaucus, NJ, USA},
  issn      = {1066-8888},
  doi       = {10.1007/s007780050061},
  url       = {http://dx.doi.org/10.1007/s007780050061},
  acmid     = {765533},
  abstract  = {We explore how to organize large text databases hierarchically by topic to aid better searching, browsing and filtering. Many corpora, such as internet directories, digital libraries, and patent databases are manually organized into topic hierarchies, also called taxonomies. Similar to indices for relational data, taxonomies make search and access more efficient. However, the exponential growth in the volume of on-line textual information makes it nearly impossible to maintain such taxonomic organization for large, fast-changing corpora by hand. We describe an automatic system that starts with a small sample of the corpus in which topics have been assigned by hand, and then updates the database with new documents as the corpus grows, assigning topics to these new documents with high speed and accuracy. To do this, we use techniques from statistical pattern recognition to efficiently separate the feature words, or discriminants, from the noise words at each node of the taxonomy. Using these, we build a multilevel classifier. At each node, this classifier can ignore the large number of ``noise'' words in a document. Thus, the classifier has a small model size and is very fast. Owing to the use of context-sensitive features, the classifier is very accurate. As a by-product, we can compute for each document a set of terms that occur significantly more often in it than in the classes to which it belongs. We describe the design and implementation of our system, stressing how to exploit standard, efficient relational operations like sorts and joins. We report on experiences with the Reuters newswire benchmark, the US patent database, and web document samples from Yahoo!. We discuss applications where our system can improve searching and filtering capabilities.},
  interhash = {5969518d9723da79c33437322c06474a},
  intrahash = {606e518c01399c1e0569a00a81719343},
}

@inproceedings{Shen:2004:WCT:1008992.1009035,
  author    = {Shen, Dou and Chen, Zheng and Yang, Qiang and Zeng, Hua-Jun and Zhang, Benyu and Lu, Yuchang and Ma, Wei-Ying},
  title     = {Web-page classification through summarization},
  booktitle = {Proceedings of the 27th annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  series    = {SIGIR '04},
  location  = {Sheffield, United Kingdom},
  pages     = {242--249},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2004},
  isbn      = {1-58113-881-4},
  doi       = {10.1145/1008992.1009035},
  url       = {http://doi.acm.org/10.1145/1008992.1009035},
  acmid     = {1009035},
  abstract  = {Web-page classification is much more difficult than pure-text classification due to a large variety of noisy information embedded in Web pages. In this paper, we propose a new Web-page classification algorithm based on Web summarization for improving the accuracy. We first give empirical evidence that ideal Web-page summaries generated by human editors can indeed improve the performance of Web-page classification algorithms. We then propose a new Web summarization-based classification algorithm and evaluate it along with several other state-of-the-art text summarization algorithms on the LookSmart Web directory. Experimental results show that our proposed summarization-based classification algorithm achieves an approximately 8.8% improvement as compared to pure-text-based classification algorithm. We further introduce an ensemble classifier using the improved summarization algorithm and show that it achieves about 12.9% improvement over pure-text based methods.},
  interhash = {328ff5b51cb573cd1d253f339892c029},
  intrahash = {b83fca9d43e5afdea78b9791cc07890c},
}

@inproceedings{liu2005experimental,
  author    = {Liu, Tie-Yan and Yang, Yiming and Wan, Hao and Zhou, Qian and Gao, Bin and Zeng, Hua-Jun and Chen, Zheng and Ma, Wei-Ying},
  title     = {An experimental study on large-scale web categorization},
  booktitle = {Special interest tracks and posters of the 14th international conference on World Wide Web},
  series    = {WWW '05},
  location  = {Chiba, Japan},
  pages     = {1106--1107},
  numpages  = {2},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2005},
  isbn      = {1-59593-051-5},
  doi       = {10.1145/1062745.1062891},
  url       = {http://doi.acm.org/10.1145/1062745.1062891},
  acmid     = {1062891},
  abstract  = {Taxonomies of the Web typically have hundreds of thousands of categories and skewed category distribution over documents. It is not clear whether existing text classification technologies can perform well on and scale up to such large-scale applications. To understand this, we conducted the evaluation of several representative methods (Support Vector Machines, k-Nearest Neighbor and Naive Bayes) with Yahoo! taxonomies. In particular, we evaluated the effectiveness/efficiency tradeoff in classifiers with hierarchical setting compared to conventional (flat) setting, and tested popular threshold tuning strategies for their scalability and accuracy in large-scale classification problems.},
  interhash = {e581e4dd2ed6d748031a812c724c4b7c},
  intrahash = {36cc9f92b9c722b2aff441b23e44b2f7},
}

@inproceedings{Dumais:2000:HCW:345508.345593,
  author    = {Dumais, Susan and Chen, Hao},
  title     = {Hierarchical classification of {Web} content},
  booktitle = {Proceedings of the 23rd annual international {ACM} {SIGIR} conference on Research and development in information retrieval},
  series    = {SIGIR '00},
  location  = {Athens, Greece},
  pages     = {256--263},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2000},
  isbn      = {1-58113-226-3},
  doi       = {10.1145/345508.345593},
  url       = {http://doi.acm.org/10.1145/345508.345593},
  acmid     = {345593},
  interhash = {1051e6db3c79db59699a253138bb3b64},
  intrahash = {be8e5ee591f98d95ff6ee2f2f227e3be},
}

@inproceedings{Lu:2009:ETN:1645953.1646167,
  author    = {Lu, Caimei and Chen, Xin and Park, E. K.},
  title     = {Exploit the tripartite network of social tagging for web clustering},
  booktitle = {Proceedings of the 18th {ACM} conference on Information and knowledge management},
  series    = {CIKM '09},
  location  = {Hong Kong, China},
  pages     = {1545--1548},
  numpages  = {4},
  publisher = {ACM},
  address   = {New York, NY, USA},
  year      = {2009},
  isbn      = {978-1-60558-512-3},
  doi       = {10.1145/1645953.1646167},
  url       = {http://doi.acm.org/10.1145/1645953.1646167},
  acmid     = {1646167},
  abstract  = {In this poster, we investigate how to enhance web clustering by leveraging the tripartite network of social tagging systems. We propose a clustering method, called "Tripartite Clustering", which cluster the three types of nodes (resources, users and tags) simultaneously based on the links in the social tagging network. The proposed method is experimented on a real-world social tagging dataset sampled from del.icio.us. We also compare the proposed clustering approach with K-means. All the clustering results are evaluated against a human-maintained web directory. The experimental results show that Tripartite Clustering significantly outperforms the content-based K-means approach and achieves performance close to that of social annotation-based K-means whereas generating much more useful information.},
  interhash = {e192e53972f28d78f1ecbffbfea08bed},
  intrahash = {86160cf68758ec60922323a34a7833f0},
}

@article{Carpineto:2009:SWC:1541880.1541884,
  author     = {Carpineto, Claudio and Osi{\'n}ski, Stanislaw and Romano, Giovanni and Weiss, Dawid},
  title      = {A survey of {Web} clustering engines},
  journal    = acmcs,
  volume     = {41},
  number     = {3},
  articleno  = {17},
  pages      = {17:1--17:38},
  numpages   = {38},
  month      = jul,
  year       = {2009},
  issue_date = {July 2009},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0360-0300},
  doi        = {10.1145/1541880.1541884},
  url        = {http://doi.acm.org/10.1145/1541880.1541884},
  acmid      = {1541884},
  abstract   = {Web clustering engines organize search results by topic, thus offering a complementary view to the flat-ranked list returned by conventional search engines. In this survey, we discuss the issues that must be addressed in the development of a Web clustering engine, including acquisition and preprocessing of search results, their clustering and visualization. Search results clustering, the core of the system, has specific requirements that cannot be addressed by classical clustering algorithms. We emphasize the role played by the quality of the cluster labels as opposed to optimizing only the clustering structure. We highlight the main characteristics of a number of existing Web clustering engines and also discuss how to evaluate their retrieval performance. Some directions for future research are finally presented.},
  interhash  = {95beef372c0d7c6f57caf0862896a0bb},
  intrahash  = {1921bab51019d89a0b740c43d8aafd23},
}

@article{Jain:1999:DCR:331499.331504,
  author     = {Jain, A. K. and Murty, M. N. and Flynn, P. J.},
  title      = {Data clustering: a review},
  journal    = acmcs,
  volume     = {31},
  number     = {3},
  pages      = {264--323},
  numpages   = {60},
  month      = sep,
  year       = {1999},
  issue_date = {Sept. 1999},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0360-0300},
  doi        = {10.1145/331499.331504},
  url        = {http://doi.acm.org/10.1145/331499.331504},
  acmid      = {331504},
  interhash  = {5113b61d428d4de4423182e5f2b2f468},
  intrahash  = {66c50650bf611d5fa9027c839ab788fd},
}

@incollection{solskinnsbakk2010hybrid,
  author      = {Solskinnsbakk, Geir and Gulla, Jon},
  title       = {A Hybrid Approach to Constructing Tag Hierarchies},
  booktitle   = {On the Move to Meaningful Internet Systems, OTM 2010},
  editor      = {Meersman, Robert and Dillon, Tharam and Herrero, Pilar},
  series      = {Lecture Notes in Computer Science},
  volume      = {6427},
  pages       = {975--982},
  publisher   = {Springer},
  address     = {Berlin / Heidelberg},
  year        = {2010},
  isbn        = {978-3-642-16948-9},
  doi         = {10.1007/978-3-642-16949-6_22},
  url         = {http://dx.doi.org/10.1007/978-3-642-16949-6_22},
  keyword     = {Computer Science},
  affiliation = {Department of Computer and Information Science, Norwegian University of Science and Technology, Trondheim, Norway},
  slides      = {http://www.slides.com},
  abstract    = {Folksonomies are becoming increasingly popular. They contain large amounts of data which can be mined and utilized for many tasks like visualization, browsing, information retrieval etc. An inherent problem of folksonomies is the lack of structure. In this paper we present an unsupervised approach for generating such structure based on a combination of association rule mining and the underlying tagged material. Using the underlying tagged material we generate a semantic representation of each tag. The semantic representation of the tags is an integral component of the structure generated. The experiment presented in this paper shows promising results with tag structures that correspond well with human judgment.},
  interhash   = {c33c0fe08d8ac29e88a4c43b3047c707},
  intrahash   = {949d497bc5a29eda10c77f5784aed18b},
}

@inproceedings{plangprasopchok2010probabilistic,
  author        = {Plangprasopchok, Anon and Lerman, Kristina and Getoor, Lise},
  title         = {A Probabilistic Approach for Learning Folksonomies from Structured Data},
  booktitle     = {Proceedings of the 4th {ACM} Web Search and Data Mining Conference},
  year          = {2010},
  eprint        = {1011.3557},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/1011.3557},
  note          = {arXiv:1011.3557; also in Proceedings of the 4th {ACM} Web Search and Data Mining Conference (WSDM)},
  abstract      = {Learning structured representations has emerged as an important problem in many domains, including document and Web data mining, bioinformatics, and image analysis. One approach to learning complex structures is to integrate many smaller, incomplete and noisy structure fragments. In this work, we present an unsupervised probabilistic approach that extends affinity propagation to combine the small ontological fragments into a collection of integrated, consistent, and larger folksonomies. This is a challenging task because the method must aggregate similar structures while avoiding structural inconsistencies and handling noise. We validate the approach on a real-world social media dataset, comprised of shallow personal hierarchies specified by many individual users, collected from the photosharing website Flickr. Our empirical results show that our proposed approach is able to construct deeper and denser structures, compared to an approach using only the standard affinity propagation algorithm. Additionally, the approach yields better overall integration quality than a state-of-the-art approach based on incremental relational clustering.},
  interhash     = {826359ec25dcd228ad3ef46dcc6d26c5},
  intrahash     = {455bb173bb33af58bc8aaed48d8a8513},
}

@inproceedings{turney2001mining,
  author        = {Turney, Peter},
  title         = {Mining the {Web} for Synonyms: {PMI-IR} Versus {LSA} on {TOEFL}},
  year          = {2001},
  url           = {http://nparc.cisti-icist.nrc-cnrc.gc.ca/npsi/ctrl?action=shwart&index=an&req=5765594&lang=en},
  internal-note = {review: required booktitle is missing for @inproceedings (reportedly ECML 2001) -- confirm before use},
  interhash     = {8e5ac4302379bb3e66512a3696669bcb},
  intrahash     = {a182202ecc7247a9567f12609f09ec7d},
}

@misc{zlati2009hypergraph,
  author        = {Zlatic, Vinko and Ghoshal, Gourab and Caldarelli, Guido},
  title         = {Hypergraph topological quantities for tagged social networks},
  year          = {2009},
  eprint        = {0905.0976},
  archiveprefix = {arXiv},
  url           = {http://arxiv.org/abs/0905.0976},
  note          = {arXiv:0905.0976; 8 pages, 9 figures, revtex},
  abstract      = {Recent years have witnessed the emergence of a new class of social networks, that require us to move beyond previously employed representations of complex graph structures. A notable example is that of the folksonomy, an online process where users collaboratively employ tags to resources to impart structure to an otherwise undifferentiated database. In a recent paper[1] we proposed a mathematical model that represents these structures as tripartite hypergraphs and defined basic topological quantities of interest. In this paper we extend our model by defining additional quantities such as edge distributions, vertex similarity and correlations as well as clustering. We then empirically measure these quantities on two real life folksonomies, the popular online photo sharing site Flickr and the bookmarking site CiteULike. We find that these systems share similar qualitative features with the majority of complex networks that have been previously studied. We propose that the quantities and methodology described here can be used as a standard tool in measuring the structure of tagged networks.},
  interhash     = {7dfecf6636651d0b42d569a2ebb5212c},
  intrahash     = {6b9803ae1a6f0c369d0420c7cf7b3559},
}

@article{strohmaier2011evaluation,
  author    = {Strohmaier, Markus and Helic, Denis and Benz, Dominik and K{\"o}rner, Christian and Kern, Roman},
  title     = {Evaluation of Folksonomy Induction Algorithms},
  journal   = {ACM Transactions on Intelligent Systems and Technology},
  year      = {2012},
  url       = {http://tist.acm.org/index.html},
  vgwort    = {43},
  interhash = {87e110b0ade230877db6855cacabcb4d},
  intrahash = {603161eb4c5b2f87f3d3a50f87015337},
}

@article{evans2008electronic,
  author    = {Evans, James A.},
  title     = {Electronic Publication and the Narrowing of Science and Scholarship},
  journal   = {Science},
  volume    = {321},
  number    = {5887},
  pages     = {395--399},
  month     = jul,
  year      = {2008},
  issn      = {0036-8075},
  doi       = {10.1126/science.1150473},
  file      = {AAAS online:2008/Evans08science.pdf:PDF},
  abstract  = {Online journals promise to serve more information to more dispersed audiences and are more efficiently searched and recalled. But because they are used differently than print—scientists and scholars tend to search electronically and follow hyperlinks rather than browse or peruse—electronically available journals may portend an ironic change for science. Using a database of 34 million articles, their citations (1945 to 2005), and online availability (1998 to 2005), I show that as more journal issues came online, the articles referenced tended to be more recent, fewer journals and articles were cited, and more of those citations were to fewer journals and articles. The forced browsing of print archives may have stretched scientists and scholars to anchor findings deeply into past and present scholarship. Searching online is more efficient and following hyperlinks quickly puts researchers in touch with prevailing opinion, but this may accelerate consensus and narrow the range of findings and ideas built upon.},
  interhash = {b0070b6e1539176aa6fc73bd1db02dcb},
  intrahash = {857192905e9637d9fb31288c03d7f83e},
}