@comment{Reformatted: one field per line; double-hyphen page ranges; bare DOIs;
  month macros; entry types corrected (@article/@misc misuse); garbled author
  names and invisible characters repaired. Citation keys left unchanged so
  existing \cite commands keep working.}

@inproceedings{bullock2011privacyaware,
  author    = {Bullock, Beate Navarro and Lerch, Hana and Ro{\ss}nagel, Alexander and Hotho, Andreas and Stumme, Gerd},
  title     = {Privacy-aware spam detection in social bookmarking systems},
  booktitle = {Proceedings of the 11th International Conference on Knowledge Management and Knowledge Technologies},
  series    = {i-KNOW '11},
  pages     = {15:1--15:8},
  articleno = {15},
  numpages  = {8},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {Graz, Austria},
  isbn      = {978-1-4503-0732-1},
  doi       = {10.1145/2024288.2024306},
  url       = {http://doi.acm.org/10.1145/2024288.2024306},
  acmid     = {2024306},
  interhash = {7a2d6a35c124ea0fe31c962f8f150916},
  intrahash = {00a8f31185a34957eb16d500d7d51398},
  year      = 2011,
  abstract  = {With the increased popularity of Web 2.0 services in the last years data privacy has become a major concern for users. The more personal data users reveal, the more difficult it becomes to control its disclosure in the web. However, for Web 2.0 service providers, the data provided by users is a valuable source for offering effective, personalised data mining services. One major application is the detection of spam in social bookmarking systems: in order to prevent a decrease of content quality, providers need to distinguish spammers and exclude them from the system. They thereby experience a conflict of interests: on the one hand, they need to identify spammers based on the information they collect about users, on the other hand, they need to respect privacy concerns and process as few personal data as possible. It would therefore be of tremendous help for system developers and users to know which personal data are needed for spam detection and which can be ignored. In this paper we address these questions by presenting a data privacy aware feature engineering approach. It consists of the design of features for spam classification which are evaluated according to both, performance and privacy conditions. Experiments using data from the social bookmarking system BibSonomy show that both conditions must not exclude each other.},
}

@inproceedings{borges2011classificationbased,
  author    = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata},
  editor    = {White, Bebo and Isa{\'i}as, Pedro and Santoro, Fl{\'a}via Maria},
  title     = {A Classification-based Approach for Bibliographic Metadata Deduplication},
  booktitle = {Proceedings of the IADIS International Conference WWW/Internet 2011},
  pages     = {221--228},
  url       = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf},
  interhash = {ca7720210214f632758211735154eea2},
  intrahash = {8f87206e413c2c632b5c633f484fcbe2},
  year      = 2011,
  abstract  = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. Besides, it is quite common to find references that omit same metadata fields such as page numbers. Duplicate entries influence the quality of digital library services once they need to be appropriately identified and treated. This paper presents a comparative analysis among different data classification algorithms used to identify duplicated bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in a previous work. Our experiments show that the combination of specific-purpose similarity functions previously proposed and classification algorithms represent an improvement up to 12% when compared to the experiments using our original approach.},
}

@inproceedings{Yang2006,
  author    = {Yang, Hui and Callan, James P.},
  editor    = {Efthimiadis, Efthimis N. and Dumais, Susan T. and Hawking, David and J{\"a}rvelin, Kalervo},
  title     = {Near-duplicate detection by instance-level constrained clustering},
  booktitle = {SIGIR},
  crossref  = {conf/sigir/2006},
  pages     = {421--428},
  publisher = {ACM},
  isbn      = {1-59593-369-7},
  ee        = {http://doi.acm.org/10.1145/1148170.1148243},
  url       = {http://dblp.uni-trier.de/db/conf/sigir/sigir2006.html#YangC06},
  interhash = {0703044e3abd1580680e66f2355813c6},
  intrahash = {27e76ac1174db2a3ee4a3efd34bb2e16},
  year      = 2006,
}

@article{cousins1998duplicate,
  author    = {Cousins, Shirley Anne},
  title     = {Duplicate Detection and Record Consolidation in Large Bibliographic Databases: The {COPAC} Database Experience},
  journal   = {Journal of Information Science},
  volume    = 24,
  number    = 4,
  pages     = {231--240},
  issn      = {0165-5515},
  refid     = {EJ573940},
  url       = {http://www.eric.ed.gov/ERICWebPortal/detail?accno=EJ573940},
  interhash = {6880df322e69a00af4df1466c7730e7a},
  intrahash = {a1067917a86f9aaaa1d5610ae113436c},
  year      = 1998,
  abstract  = {COPAC is a union catalog giving access to the online catalog records of some of the largest academic research libraries in the United Kingdom and Ireland. Discussion includes ways in which duplicate detection and record consolidation procedures are carried out, along with problem areas encountered. (Author/AEF)},
}

@article{kapidakis2008duplicate,
  author    = {Sitas, Anestis and Kapidakis, Sarantos},
  title     = {Duplicate detection algorithms of bibliographic descriptions},
  journal   = {Library Hi Tech},
  volume    = 26,
  number    = 2,
  pages     = {287--301},
  url       = {http://www.ionio.gr/~sarantos/repository/j21J-LibraryHiTech-Sitas.pdf},
  interhash = {94c3f69a754778b492d725bb08ffc0fb},
  intrahash = {633b89b5a6827d28513545282f9f8bc7},
  year      = 2008,
}

@misc{Sarma2011,
  author       = {Sarma, Anish Das and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip},
  title        = {{CBLOCK}: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks},
  eprint       = {1111.3689},
  archiveprefix = {arXiv},
  url          = {http://arxiv.org/abs/1111.3689},
  interhash    = {3f32848ef4bb26a3057c3feadff99c5a},
  intrahash    = {389dba4432b1340211ef6be8e3d45a1d},
  year         = 2011,
  abstract     = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes. In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), impose disjointness of blocks (in a grid environment), or specify a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls-up} smaller blocks to increase recall. We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.},
}

@inproceedings{10.1109/ICMV.2009.43,
  author    = {Rehman, Mariam and Esichaikul, Vatcharapon},
  title     = {Duplicate Record Detection for Database Cleansing},
  booktitle = {Proceedings of the International Conference on Machine Vision ({ICMV} 2009)},
  pages     = {333--338},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA, USA},
  isbn      = {978-0-7695-3944-7},
  doi       = {10.1109/ICMV.2009.43},
  url       = {http://www.computer.org/portal/web/csdl/doi/10.1109/ICMV.2009.43},
  interhash = {fa5b4188783fae5543eb14602d083e06},
  intrahash = {d14c2c587d32c0c91184183298683c10},
  year      = 2009,
}

@book{noauthororeditoryahoo,
  author    = {Tang, Lei and Liu, Huan},
  title     = {Community Detection and Mining in Social Media},
  publisher = {Morgan \& Claypool Publishers},
  doi       = {10.2200/S00298ED1V01Y201009DMK003},
  url       = {http://www.morganclaypool.com/doi/abs/10.2200/S00298ED1V01Y201009DMK003},
  interhash = {717f8b976eec1dc934a3b84675456f25},
  intrahash = {c4e1fa6bf2d52a237e5557640d87c970},
  year      = 2010,
  abstract  = {The past decade has witnessed the emergence of participatory Web and social media, bringing people together in many creative ways. Millions of users are playing, tagging, working, and socializing online, demonstrating new forms of collaboration, communication, and intelligence that were hardly imaginable just a short time ago. Social media also helps reshape business models, sway opinions and emotions, and opens up numerous possibilities to study human interaction and collective behavior in an unparalleled scale. This lecture, from a data mining perspective, introduces characteristics of social media, reviews representative tasks of computing with social media, and illustrates associated challenges. It introduces basic concepts, presents state-of-the-art algorithms with easy-to-understand examples, and recommends effective evaluation methods. In particular, we discuss graph-based community detection techniques and many important extensions that handle dynamic, heterogeneous networks in social media. We also demonstrate how discovered patterns of communities can be used for social media mining. The concepts, algorithms, and methods presented in this lecture can help harness the power of social media and support building socially-intelligent systems. This book is an accessible introduction to the study of \emph{community detection and mining in social media}. It is an essential reading for students, researchers, and practitioners in disciplines and applications where social media is a key source of data that piques our curiosity to understand, manage, innovate, and excel. This book is supported by additional materials, including lecture slides, the complete set of figures, key references, some toy data sets used in the book, and the source code of representative algorithms. The readers are encouraged to visit the book website for the latest information. Table of Contents: Social Media and Social Computing / Nodes, Ties, and Influence / Community Detection and Evaluation / Communities in Heterogeneous Networks / Social Media Mining},
}

@inproceedings{morishima2009bringing,
  author    = {Morishima, Atsuyuki and Nakamizo, Akiyoshi and Iida, Toshinari and Sugimoto, Shigeo and Kitagawa, Hiroyuki},
  title     = {Bringing Your Dead Links Back to Life: A Comprehensive Approach and Lessons Learned},
  booktitle = {HT '09: Proceedings of the Twentieth ACM Conference on Hypertext and Hypermedia},
  publisher = {ACM},
  address   = {New York, NY, USA},
  month     = jul,
  paperid   = {fp038},
  session   = {Full Paper},
  interhash = {4ee86291d78698b239edee7c54f967fe},
  intrahash = {73faf07a4750d730ce0455e5613b8687},
  year      = 2009,
  abstract  = {This paper presents an experimental study of the automatic correction of broken (dead) Web links focusing, in particular, on links broken by the relocation of Web pages. Our first contribution is that we developed an algorithm that incorporates a comprehensive set of heuristics, some of which are novel, in a single unified framework. The second contribution is that we conducted a relatively large-scale experiment, and analysis of our results revealed the characteristics of the problem of finding moved Web pages. We demonstrated empirically that the problem of searching for moved pages is different from typical information retrieval problems. First, it is impossible to identify the final destination until the page is moved, so the index-server approach is not necessarily effective. Secondly, there is a large bias about where the new address is likely to be and crawler-based solutions can be effectively implemented, avoiding the need to search the entire Web. We analyzed the experimental results in detail to show how important each heuristic is in real Web settings, and conducted statistical analyses to show that our algorithm succeeds in correctly finding new links for more than 70% of broken links at 95% confidence level.},
}

@inproceedings{Detecting_Commmunities_via_Simultaneous_Clustering_of_Graphs_and_Folksonomies,
  author    = {Java, Akshay and Joshi, Anupam and Finin, Tim},
  title     = {Detecting Communities via Simultaneous Clustering of Graphs and Folksonomies},
  booktitle = {WebKDD 2008 Workshop on Web Mining and Web Usage Analysis},
  month     = aug,
  note      = {To Appear},
  interhash = {acfec953843b168e61e2e167e29b4c3d},
  intrahash = {645abd6b3191a2a6e844d7542651ed1c},
  year      = 2008,
}

@inproceedings{xin2008www,
  author    = {Li, Xin and Guo, Lei and Zhao, Yihong E.},
  title     = {Tag-based Social Interest Discovery},
  booktitle = {Proceedings of the 17th International World Wide Web Conference},
  pages     = {675--684},
  publisher = {ACM},
  url       = {http://www2008.org/papers/pdf/p675-liA.pdf},
  interhash = {d7e6a5b8d215682b2a75add69c01de29},
  intrahash = {42b4c94cff05ccef031235d661a7a77a},
  year      = 2008,
  abstract  = {The success and popularity of social network systems, such as del.icio.us, Facebook, MySpace, and YouTube, have generated many interesting and challenging problems to the research community. Among others, discovering social interests shared by groups of users is very important because it helps to connect people with common interests and encourages people to contribute and share more contents. The main challenge to solving this problem comes from the difficulty of detecting and representing the interest of the users. The existing approaches are all based on the online connections of users and so unable to identify the common interest of users who have no online connections. In this paper, we propose a novel social interest discovery approach based on user-generated tags. Our approach is motivated by the key observation that in a social network, human users tend to use descriptive tags to annotate the contents that they are interested in. Our analysis on a large amount of real-world traces reveals that in general, user-generated tags are consistent with the web content they are attached to, while more concise and closer to the understanding and judgments of human users about the content. Thus, patterns of frequent co-occurrences of user tags can be used to characterize and capture topics of user interests. We have developed an Internet Social Interest Discovery system, ISID, to discover the common user interests and cluster users and their saved URLs by different interest topics. Our evaluation shows that ISID can effectively cluster similar documents by interest topics and discover user communities with common interests no matter if they have any online connections.},
}

@inproceedings{1281269,
  author    = {Tantipathananandh, Chayant and Berger-Wolf, Tanya and Kempe, David},
  title     = {A framework for community identification in dynamic social networks},
  booktitle = {KDD '07: Proceedings of the 13th ACM SIGKDD international conference on Knowledge discovery and data mining},
  pages     = {717--726},
  publisher = {ACM},
  address   = {New York, NY, USA},
  location  = {San Jose, California, USA},
  isbn      = {978-1-59593-609-7},
  doi       = {10.1145/1281192.1281269},
  url       = {http://portal.acm.org/citation.cfm?doid=1281192.1281269},
  interhash = {9373b48866b4faa1941db0bee9265af0},
  intrahash = {27a4fb58300979d4dbe94e75422418bd},
  year      = 2007,
}

@inproceedings{Approximating2008Java,
  author    = {Java, Akshay and Joshi, Anupam and Finin, Tim},
  title     = {Approximating the Community Structure of the Long Tail},
  booktitle = {Proceedings of the Second International Conference on Weblogs and Social Media ({ICWSM} 2008)},
  publisher = {AAAI Press},
  url       = {http://ebiquity.umbc.edu/paper/html/id/381/Approximating-the-Community-Structure-of-the-Long-Tail},
  interhash = {ede357e110fee8803dc181d262f30087},
  intrahash = {386f36679c111f30e37ced272d5b355c},
  year      = 2008,
  abstract  = {In many social media applications, a small fraction of the members are highly linked while most are sparsely connected to the network. Such a skewed distribution is sometimes referred to as the "long tail". Popular applications like meme trackers and content aggregators mine for information from only the popular blogs located at the head of this curve. On the other hand, the long tail contains large volumes of interesting information and niches. The question we address in this work is how best to approximate the community membership of entities in the long tail using only a small percentage of the entire graph structure. Our technique utilizes basic linear algebra manipulations and spectral methods. It has the advantage of quickly and efficiently finding a reasonable approximation of the community structure of the overall network. Such a method has significant applications in blog analysis engines as well as social media monitoring tools in general.},
}

@inproceedings{hotho06trend,
  author    = {Hotho, Andreas and J{\"a}schke, Robert and Schmitz, Christoph and Stumme, Gerd},
  editor    = {Avrithis, Yannis S. and Kompatsiaris, Yiannis and Staab, Steffen and O'Connor, Noel E.},
  title     = {Trend Detection in Folksonomies},
  booktitle = {Proc. First International Conference on Semantics And Digital Media Technology (SAMT)},
  series    = {Lecture Notes in Computer Science},
  volume    = 4306,
  pages     = {56--70},
  publisher = {Springer},
  isbn      = {3-540-49335-2},
  date      = {2006-12-13},
  ee        = {http://dx.doi.org/10.1007/11930334_5},
  url       = {http://dblp.uni-trier.de/db/conf/samt/samt2006.html#HothoJSS06},
  vgwort    = {27},
  interhash = {227be738c5cea57530d592463fd09abd},
  intrahash = {2df7426d8ae0bd65c6f095d3fc8a703e},
  year      = 2006,
}

@article{journals/tois/ChowdhuryFGM02,
  author    = {Chowdhury, Abdur and Frieder, Ophir and Grossman, David A. and McCabe, M. Catherine},
  title     = {Collection statistics for fast duplicate document detection},
  journal   = {ACM Trans. Inf. Syst.},
  volume    = 20,
  number    = 2,
  pages     = {171--191},
  date      = {2003-11-25},
  ee        = {http://doi.acm.org/10.1145/506309.506311},
  url       = {http://dblp.uni-trier.de/db/journals/tois/tois20.html#ChowdhuryFGM02},
  interhash = {4357db306875755262451e702911ebe0},
  intrahash = {24249e2a7b8b809050f9083fc75d3c18},
  year      = 2002,
}

@inproceedings{conf/pkdd/CaiSHYH05,
  author    = {Cai, Deng and Shao, Zheng and He, Xiaofei and Yan, Xifeng and Han, Jiawei},
  editor    = {Jorge, Al{\'i}pio and Torgo, Lu{\'i}s and Brazdil, Pavel and Camacho, Rui and Gama, Jo{\~a}o},
  title     = {Community Mining from Multi-relational Networks},
  booktitle = {PKDD},
  crossref  = {conf/pkdd/2005},
  series    = {Lecture Notes in Computer Science},
  volume    = 3721,
  pages     = {445--452},
  publisher = {Springer},
  isbn      = {3-540-29244-6},
  date      = {2005-11-14},
  ee        = {http://dx.doi.org/10.1007/11564126_44},
  url       = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2005.html#CaiSHYH05},
  interhash = {ffbc54eea27012bcd133f5b96fde010f},
  intrahash = {6045c9e8efdcbe1578f1f769c6b9274b},
  year      = 2005,
}

@inproceedings{1166191,
  author    = {Hidalgo, Jos{\'e} Mar{\'i}a G{\'o}mez and Bringas, Guillermo Cajigas and S{\'a}nz, Enrique Puertas and Garc{\'i}a, Francisco Carrero},
  title     = {Content based {SMS} spam filtering},
  booktitle = {DocEng '06: Proceedings of the 2006 ACM symposium on Document engineering},
  pages     = {107--114},
  publisher = {ACM Press},
  address   = {New York, NY, USA},
  location  = {Amsterdam, The Netherlands},
  isbn      = {1-59593-515-0},
  doi       = {10.1145/1166160.1166191},
  url       = {http://doi.acm.org/10.1145/1166160.1166191},
  interhash = {6c439b2270b1bc2f5a3d6e0fa9eb1ae0},
  intrahash = {45ac4e6a34bb50c2061a29e52079b576},
  year      = 2006,
}

@article{journals/cn/BroderGMZ97,
  author    = {Broder, Andrei Z. and Glassman, Steven C. and Manasse, Mark S. and Zweig, Geoffrey},
  title     = {Syntactic Clustering of the Web},
  journal   = {Computer Networks},
  volume    = 29,
  number    = {8-13},
  pages     = {1157--1166},
  date      = {2003-11-27},
  ee        = {http://dx.doi.org/10.1016/S0169-7552(97)00031-7},
  url       = {http://dblp.uni-trier.de/db/journals/cn/cn29.html#BroderGMZ97},
  interhash = {424cdc36335873e4d8c0bed6e07e872e},
  intrahash = {b88a36c088beef971845324c862599d0},
  year      = 1997,
}

@techreport{284,
  author      = {Gyongyi, Z. and Berkhin, P. and Garcia-Molina, H. and Pedersen, J.},
  title       = {Link spam detection based on mass estimation},
  institution = {Stanford University},
  type        = {Technical Report},
  url         = {http://infolab.stanford.edu/~zoltan/publications/gyongyi2006link.pdf},
  interhash   = {987e3d3fd3c529a2662a5387bc568793},
  intrahash   = {4dda644faa9132ef2f09ac8a13f11d75},
  year        = 2005,
}

@article{reichardt-2004-93,
  author    = {Reichardt, Joerg and Bornholdt, Stefan},
  title     = {Detecting fuzzy community structures in complex networks with a {Potts} model},
  journal   = {Physical Review Letters},
  volume    = 93,
  pages     = {218701},
  url       = {http://www.citebase.org/abstract?id=oai:arXiv.org:cond-mat/0402349},
  interhash = {d424bcc57ba04601143ad2aae05c2def},
  intrahash = {7463a040a13328d06cf0f5b0f32ae85a},
  year      = 2004,
}