@inproceedings{borges2011classificationbased,
  abstract = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. It is also quite common to find references that omit some metadata fields, such as page numbers. Duplicate entries degrade the quality of digital library services, so they must be appropriately identified and treated. This paper presents a comparative analysis of different data classification algorithms used to identify duplicate bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in previous work. Our experiments show that combining the previously proposed specific-purpose similarity functions with classification algorithms yields an improvement of up to 12% over our original approach.},
  author = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata},
  booktitle = {Proceedings of the IADIS International Conference WWW/Internet 2011},
  editor = {White, Bebo and Isaías, Pedro and Santoro, Flávia Maria},
  interhash = {ca7720210214f632758211735154eea2},
  intrahash = {8f87206e413c2c632b5c633f484fcbe2},
  pages = {221--228},
  title = {A Classification-based Approach for Bibliographic Metadata Deduplication},
  url = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf},
  year = 2011
}

@article{Borges_DeCarvalho_Galante_Gonalves_Laender_2011,
  author = {Borges, Eduardo N. and De Carvalho, Moisés G. and Galante, Renata and Gonçalves, Marcos André and Laender, Alberto H. F.},
  interhash = {0271248d1218f087a643c4aa906607f9},
  intrahash = {e7bc9412f92dddbfd5eaf81648ac5849},
  journal = {Information Processing \& Management},
  number = 5,
  pages = {706--718},
  publisher = {Elsevier Ltd},
  title = {An unsupervised heuristic-based approach for bibliographic metadata deduplication},
  url = {http://linkinghub.elsevier.com/retrieve/pii/S0306457311000100},
  volume = 47,
  year = 2011
}

@inproceedings{conf/nddl/JiangM01,
  author = {Jiang, Xiaoyi and Mojon, Daniel},
  booktitle = {NDDL},
  crossref = {conf/nddl/2001},
  editor = {Isaías, Pedro T.},
  interhash = {8b59b4f98b226ee20294ca7d56abe105},
  intrahash = {b8efa49fddc744c36debf5a31b8142b4},
  isbn = {972-98050-4-0},
  pages = {79--88},
  publisher = {ICEIS Press},
  title = {Filtering Duplicate Publications in Bibliographic Databases},
  url = {http://dblp.uni-trier.de/db/conf/nddl/nddl2001.html#JiangM01},
  year = 2001
}

@inproceedings{Yang2006,
  author = {Yang, Hui and Callan, James P.},
  booktitle = {SIGIR},
  crossref = {conf/sigir/2006},
  editor = {Efthimiadis, Efthimis N. and Dumais, Susan T. and Hawking, David and Järvelin, Kalervo},
  ee = {http://doi.acm.org/10.1145/1148170.1148243},
  interhash = {0703044e3abd1580680e66f2355813c6},
  intrahash = {27e76ac1174db2a3ee4a3efd34bb2e16},
  isbn = {1-59593-369-7},
  pages = {421--428},
  publisher = {ACM},
  title = {Near-duplicate detection by instance-level constrained clustering},
  url = {http://dblp.uni-trier.de/db/conf/sigir/sigir2006.html#YangC06},
  year = 2006
}

@article{journals/is/Goyal87,
  author = {Goyal, Pankaj},
  ee = {http://dx.doi.org/10.1016/0306-4379(87)90002-0},
  interhash = {10c6a8bffb2ffa6633ab2e64eb7fc43e},
  intrahash = {fc0cb18a9ce7efd3659c07f5e3c01541},
  journal = {Inf. Syst.},
  number = 3,
  pages = {239--242},
  title = {Duplicate record identification in bibliographic databases},
  url = {http://dblp.uni-trier.de/db/journals/is/is12.html#Goyal87},
  volume = 12,
  year = 1987
}

@article{cousins1998duplicate,
  abstract = {COPAC is a union catalog giving access to the online catalog records of some of the largest academic research libraries in the United Kingdom and Ireland. Discussion includes ways in which duplicate detection and record consolidation procedures are carried out, along with problem areas encountered. (Author/AEF)},
  author = {Cousins, Shirley Anne},
  interhash = {6880df322e69a00af4df1466c7730e7a},
  intrahash = {a1067917a86f9aaaa1d5610ae113436c},
  issn = {0165-5515},
  journal = {Journal of Information Science},
  number = 4,
  pages = {231--240},
  refid = {EJ573940},
  title = {Duplicate Detection and Record Consolidation in Large Bibliographic Databases: The COPAC Database Experience},
  url = {http://www.eric.ed.gov/ERICWebPortal/detail?accno=EJ573940},
  volume = 24,
  year = 1998
}

@article{kapidakis2008duplicate,
  author = {Sitas, Anestis and Kapidakis, Sarantos},
  interhash = {94c3f69a754778b492d725bb08ffc0fb},
  intrahash = {633b89b5a6827d28513545282f9f8bc7},
  journal = {Library Hi Tech},
  number = 2,
  pages = {287--301},
  title = {Duplicate detection algorithms of bibliographic descriptions},
  url = {http://www.ionio.gr/~sarantos/repository/j21J-LibraryHiTech-Sitas.pdf},
  volume = 26,
  year = 2008
}

@inproceedings{Padmasree06,
  address = {Alexandria},
  author = {Padmasree, Lam and Ambati, Vamshi and Chandulal, Jasthi Anand and Rao, Meda Sreenivasa},
  booktitle = {Proceedings of the 2nd ICUDL},
  editor = {Serageldin, Ismail and Reddy, Raj},
  interhash = {06c68062346c3b1634c950a8bf96deec},
  intrahash = {6369260b8ed58d9445b8d2df0a1864f4},
  title = {Signature Based Duplicate Detection in Digital Libraries},
  url = {http://www.ulib.org/conference/2006/25.pdf},
  year = 2006
}

@inproceedings{conf/gfkl/PotthastS07,
  author = {Potthast, Martin and Stein, Benno},
  booktitle = {GfKl},
  crossref = {conf/gfkl/2007},
  editor = {Preisach, Christine and Burkhardt, Hans and Schmidt-Thieme, Lars and Decker, Reinhold},
  ee = {http://dx.doi.org/10.1007/978-3-540-78246-9_71},
  interhash = {3686fe6dcbfc3683234edb5d1d7aad05},
  intrahash = {2eeececfe9ce4c4956142231523df00a},
  isbn = {978-3-540-78239-1},
  pages = {601--609},
  publisher = {Springer},
  series = {Studies in Classification, Data Analysis, and Knowledge Organization},
  title = {New Issues in Near-duplicate Detection},
  url = {http://www.uni-weimar.de/medien/webis/publications/papers/stein_2008d.pdf},
  year = 2007
}

@misc{Sarma2011,
  abstract = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes.
In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), imposing disjointness of blocks (in a grid environment), or specifying a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls up} smaller blocks to increase recall. We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.},
  author = {Sarma, Anish Das and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip},
  interhash = {3f32848ef4bb26a3057c3feadff99c5a},
  intrahash = {389dba4432b1340211ef6be8e3d45a1d},
  note = {arXiv:1111.3689},
  title = {CBLOCK: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks},
  url = {http://arxiv.org/abs/1111.3689},
  year = 2011
}

@article{Elmagarmid2007Duplicate,
  abstract = {Often, in the real world, entities have two or more representations in databases. Duplicate records do not share a common key and/or they contain errors that make duplicate matching a difficult task. Errors are introduced as the result of transcription errors, incomplete information, lack of standard formats, or any combination of these factors. In this paper, we present a thorough analysis of the literature on duplicate record detection. We cover similarity metrics that are commonly used to detect similar field entries, and we present an extensive set of duplicate detection algorithms that can detect approximately duplicate records in a database. We also cover multiple techniques for improving the efficiency and scalability of approximate duplicate detection algorithms. We conclude with coverage of existing tools and with a brief discussion of the big open problems in the area.},
  author = {Elmagarmid, A. K. and Ipeirotis, P. G. and Verykios, V. S.},
  citeulike-article-id = {1116298},
  interhash = {c8603198a5bd3d2e571462e08f50e12b},
  intrahash = {bfff8a370abdf14f7f882f87c1ff61e1},
  journal = {IEEE Transactions on Knowledge and Data Engineering},
  number = 1,
  pages = {1--16},
  posted-at = {2008-02-06 12:37:40},
  priority = {5},
  title = {Duplicate Record Detection: A Survey},
  url = {http://ieeexplore.ieee.org/xpls/abs\_all.jsp?arnumber=4016511},
  volume = 19,
  year = 2007
}

@inproceedings{10.1109/ICMV.2009.43,
  address = {Los Alamitos, CA, USA},
  author = {Rehman, Mariam and Esichaikul, Vatcharapon},
  booktitle = {International Conference on Machine Vision},
  doi = {10.1109/ICMV.2009.43},
  interhash = {fa5b4188783fae5543eb14602d083e06},
  intrahash = {d14c2c587d32c0c91184183298683c10},
  isbn = {978-0-7695-3944-7},
  pages = {333--338},
  publisher = {IEEE Computer Society},
  title = {Duplicate Record Detection for Database Cleansing},
  url = {http://www.computer.org/portal/web/csdl/doi/10.1109/ICMV.2009.43},
  year = 2009
}

@inproceedings{bibkey2009,
  abstract = {This poster presents a set of hash keys for bibliographic records called bibkeys. Unlike other methods of duplicate detection, bibkeys can be calculated directly from a set of basic metadata fields (title, authors/editors, year). It is shown how bibkeys are used to map similar bibliographic records in BibSonomy and among distributed library catalogs and other distributed databases.},
  author = {Voss, Jakob and Hotho, Andreas and Jäschke, Robert},
  booktitle = {Information: Droge, Ware oder Commons?},
  editor = {Kuhlen, Rainer},
  interhash = {6e394e459d11dfa17f5d4cf1b8dd81c3},
  intrahash = {01f6fe57f46e4b92fe02869341efdd8d},
  organization = {Hochschulverband Informationswissenschaft},
  publisher = {Verlag Werner Hülsbusch},
  series = {Proceedings of the ISI},
  title = {Mapping Bibliographic Records with Bibliographic Hash Keys},
  url = {http://eprints.rclis.org/15953/},
  year = 2009
}

@article{journals/tois/ChowdhuryFGM02,
  author = {Chowdhury, Abdur and Frieder, Ophir and Grossman, David A. and McCabe, M. Catherine},
  ee = {http://doi.acm.org/10.1145/506309.506311},
  interhash = {4357db306875755262451e702911ebe0},
  intrahash = {24249e2a7b8b809050f9083fc75d3c18},
  journal = {ACM Trans. Inf. Syst.},
  number = 2,
  pages = {171--191},
  title = {Collection statistics for fast duplicate document detection},
  url = {http://dblp.uni-trier.de/db/journals/tois/tois20.html#ChowdhuryFGM02},
  volume = 20,
  year = 2002
}

@article{journals/cn/BroderGMZ97,
  author = {Broder, Andrei Z. and Glassman, Steven C. and Manasse, Mark S. and Zweig, Geoffrey},
  ee = {http://dx.doi.org/10.1016/S0169-7552(97)00031-7},
  interhash = {424cdc36335873e4d8c0bed6e07e872e},
  intrahash = {b88a36c088beef971845324c862599d0},
  journal = {Computer Networks},
  number = {8-13},
  pages = {1157--1166},
  title = {Syntactic Clustering of the Web},
  url = {http://dblp.uni-trier.de/db/journals/cn/cn29.html#BroderGMZ97},
  volume = 29,
  year = 1997
}