% Journal article (Information Systems, vol. 26, no. 8).
@article{tejada2001learning,
  author    = {Tejada, Sheila and Knoblock, Craig A. and Minton, Steven},
  title     = {Learning object identification rules for information integration},
  journal   = {Information Systems},
  volume    = {26},
  number    = {8},
  pages     = {607--633},
  month     = dec,
  year      = {2001},
  issn      = {0306-4379},
  doi       = {10.1016/S0306-4379(01)00042-4},
  url       = {http://www.sciencedirect.com/science/article/pii/S0306437901000424},
  interhash = {f9f59187b0397a0fbe1e558dfb4ad9cf},
  intrahash = {5ad46801d602408ce271276f452263a9},
  abstract  = {When integrating information from multiple websites, the same data objects can exist in inconsistent text formats across sites, making it difficult to identify matching objects using exact text match. We have developed an object identification system called Active Atlas, which compares the objects’ shared attributes in order to identify matching objects. Certain attributes are more important for deciding if a mapping should exist between two objects. Previous methods of object identification have required manual construction of object identification rules or mapping rules for determining the mappings between objects. This manual process is time consuming and error-prone. In our approach, Active Atlas learns to tailor mapping rules, through limited user input, to a specific application domain. The experimental results demonstrate that we achieve higher accuracy and require less user involvement than previous methods across various application domains.},
}

% Conference paper: typed as inproceedings (not article) because the venue is
% a proceedings volume, which belongs in booktitle rather than journal.
@inproceedings{borges2011classification,
  author    = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata},
  title     = {A Classification-based Approach for Bibliographic Metadata Deduplication},
  booktitle = {Proceedings of the {IADIS} International Conference {WWW/Internet} 2011},
  editor    = {White, Bebo and Isaías, Pedro and Santoro, Flávia Maria},
  pages     = {221--228},
  year      = {2011},
  url       = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf},
  interhash = {ca7720210214f632758211735154eea2},
  intrahash = {8f87206e413c2c632b5c633f484fcbe2},
  abstract  = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. Besides, it is quite common to find references that omit some metadata fields such as page numbers. Duplicate entries influence the quality of digital library services once they need to be appropriately identified and treated. This paper presents a comparative analysis among different data classification algorithms used to identify duplicated bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in a previous work. Our experiments show that the combination of specific-purpose similarity functions previously proposed and classification algorithms represent an improvement up to 12\% when compared to the experiments using our original approach.},
}

% Poster contribution (per its abstract) in a volume of the series
% "Proceedings of the ISI".
@inproceedings{voss2009mapping,
  author       = {Voss, Jakob and Hotho, Andreas and Jäschke, Robert},
  title        = {Mapping Bibliographic Records with Bibliographic Hash Keys},
  booktitle    = {Information: Droge, Ware oder Commons?},
  editor       = {Kuhlen, Rainer},
  series       = {Proceedings of the ISI},
  publisher    = {Verlag Werner Hülsbusch},
  organization = {Hochschulverband Informationswissenschaft},
  year         = {2009},
  url          = {http://eprints.rclis.org/15953/},
  interhash    = {6e394e459d11dfa17f5d4cf1b8dd81c3},
  intrahash    = {01f6fe57f46e4b92fe02869341efdd8d},
  abstract     = {This poster presents a set of hash keys for bibliographic records called bibkeys. Unlike other methods of duplicate detection, bibkeys can directly be calculated from a set of basic metadata fields (title, authors/editors, year). It is shown how bibkeys are used to map similar bibliographic records in BibSonomy and among distributed library catalogs and other distributed databases.},
}