% Bibliography: bibliographic reference parsing, metadata deduplication and
% related digital-library work.
%
% Conventions used in this file:
%   - one field per line, values delimited by braces;
%   - page ranges use a double hyphen (en dash);
%   - non-ASCII letters are written as brace-wrapped LaTeX escapes so that the
%     file sorts and labels correctly under classic BibTeX as well as Biber;
%   - volume, number and pages hold bare values (no "Vol.", "Iss." or "pp.");
%   - interhash, intrahash, acmid, ee and vgwort are exporter bookkeeping
%     fields that standard styles ignore.
%
% Citation keys are deliberately left untouched so existing citations resolve.

@inproceedings{Sautter:2012:IBR:2403832.2403883,
  abstract  = {Parsing details like author names and titles out of bibliographic references of scientific publications is an important issue. However, most existing techniques are tailored to the highly standardized reference styles used in the last two to three decades. Their performance tends to degrade when faced with the wider variety of reference styles used in older, historic publications. Thus, existing techniques are of limited use when creating comprehensive bibliographies covering both historic and contemporary scientific publications. This paper presents RefParse, a generic approach to bibliographic reference parsing that is independent of any specific reference style. Its core feature is an inference mechanism that exploits the regularities inherent in any list of references to deduce its format. Our evaluation shows that RefParse outperforms existing parsers both for contemporary and for historic reference lists.},
  acmid     = {2403883},
  address   = {Berlin, Heidelberg},
  author    = {Sautter, Guido and B{\"o}hm, Klemens},
  booktitle = {Proceedings of the Second International Conference on Theory and Practice of Digital Libraries},
  doi       = {10.1007/978-3-642-33290-6_40},
  interhash = {20fe241af3945dca2e242ae72eae05ad},
  intrahash = {ce9a27e85a0cc6bef109d5130e7ed1ea},
  isbn      = {978-3-642-33289-0},
  location  = {Paphos, Cyprus},
  numpages  = {13},
  pages     = {370--382},
  publisher = {Springer-Verlag},
  series    = {TPDL'12},
  title     = {Improved Bibliographic Reference Parsing Based on Repeated Patterns},
  url       = {http://dx.doi.org/10.1007/978-3-642-33290-6_40},
  year      = 2012
}

% NOTE(review): the next two entries describe the same paper under two keys.
% Both keys are kept (and kept field-for-field identical) so that documents
% citing either one still compile; consolidate once all citations use one key.

@inproceedings{borges2011classification,
  abstract  = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. Besides, it is quite common to find references that omit same metadata fields such as page numbers. Duplicate entries influence the quality of digital library services once they need to be appropriately identified and treated. This paper presents a comparative analysis among different data classification algorithms used to identify duplicated bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in a previous work. Our experiments show that the combination of specific-purpose similarity functions previously proposed and classification algorithms represent an improvement up to 12\% when compared to the experiments using our original approach.},
  author    = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata},
  booktitle = {Proceedings of the IADIS International Conference WWW/Internet 2011},
  editor    = {White, Bebo and Isa{\'\i}as, Pedro and Santoro, Fl{\'a}via Maria},
  interhash = {ca7720210214f632758211735154eea2},
  intrahash = {8f87206e413c2c632b5c633f484fcbe2},
  pages     = {221--228},
  title     = {A Classification-based Approach for Bibliographic Metadata Deduplication},
  url       = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf},
  year      = 2011
}

@inproceedings{borges2011classificationbased,
  abstract  = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. Besides, it is quite common to find references that omit same metadata fields such as page numbers. Duplicate entries influence the quality of digital library services once they need to be appropriately identified and treated. This paper presents a comparative analysis among different data classification algorithms used to identify duplicated bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in a previous work. Our experiments show that the combination of specific-purpose similarity functions previously proposed and classification algorithms represent an improvement up to 12\% when compared to the experiments using our original approach.},
  author    = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata},
  booktitle = {Proceedings of the IADIS International Conference WWW/Internet 2011},
  editor    = {White, Bebo and Isa{\'\i}as, Pedro and Santoro, Fl{\'a}via Maria},
  interhash = {ca7720210214f632758211735154eea2},
  intrahash = {8f87206e413c2c632b5c633f484fcbe2},
  pages     = {221--228},
  title     = {A Classification-based Approach for Bibliographic Metadata Deduplication},
  url       = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf},
  year      = 2011
}

@article{Borges_DeCarvalho_Galante_Gonalves_Laender_2011,
  author    = {Borges, Eduardo N and De Carvalho, Mois{\'e}s G and Galante, Renata and Gon{\c{c}}alves, Marcos Andr{\'e} and Laender, Alberto H F},
  interhash = {0271248d1218f087a643c4aa906607f9},
  intrahash = {e7bc9412f92dddbfd5eaf81648ac5849},
  journal   = {Information Processing \& Management},
  number    = 5,
  pages     = {706--718},
  publisher = {Elsevier Ltd},
  title     = {An unsupervised heuristic-based approach for bibliographic metadata deduplication},
  url       = {http://linkinghub.elsevier.com/retrieve/pii/S0306457311000100},
  volume    = 47,
  year      = 2011
}

@article{ayres2007twentyfive,
  abstract  = {This article describes cooperation between Bradford University Library and the Department of Computing that has resulted in nine research projects over a twenty-five year period on various aspects of bibliographic control. It recounts the origins of the Universal Standard Bibliographic Code (USBC) and its development for the identification of both books and non-book material. It then describes various aspects of the projects including simulating the merging necessary to set up a national database, the cleaning of a database, its use in inter-library lending, and its application together with expert systems for the quality control of databases. The final project is BOPAC that has used modern technology to create faster and better access to a number of library catalogues worldwide and has demonstrated that authority control in its present form is not effective.},
  author    = {Ayres, F. H. and Ridley, J. M.},
  doi       = {10.1300/J104v44n01_08},
  eprint    = {http://www.tandfonline.com/doi/pdf/10.1300/J104v44n01_08},
  interhash = {cca4fc8a3cfac69e11678be40da8de8f},
  intrahash = {ac2f370d6267ed2d9e8e81ae4d709735},
  journal   = {Cataloging \& Classification Quarterly},
  number    = {1--2},
  pages     = {113--130},
  title     = {Twenty-Five Years of Bibliographic Control Research at the {University of Bradford}},
  url       = {http://www.tandfonline.com/doi/abs/10.1300/J104v44n01_08},
  volume    = 44,
  year      = 2007
}

@inproceedings{conf/nddl/JiangM01,
  author    = {Jiang, Xiaoyi and Mojon, Daniel},
  booktitle = {NDDL},
  crossref  = {conf/nddl/2001},
  editor    = {Isa{\'\i}as, Pedro T.},
  interhash = {8b59b4f98b226ee20294ca7d56abe105},
  intrahash = {b8efa49fddc744c36debf5a31b8142b4},
  isbn      = {972-98050-4-0},
  pages     = {79--88},
  publisher = {ICEIS Press},
  title     = {Filtering Duplicate Publications in Bibliographic Databases},
  url       = {http://dblp.uni-trier.de/db/conf/nddl/nddl2001.html#JiangM01},
  year      = 2001
}

@inproceedings{Yang2006,
  author    = {Yang, Hui and Callan, James P.},
  booktitle = {SIGIR},
  crossref  = {conf/sigir/2006},
  editor    = {Efthimiadis, Efthimis N. and Dumais, Susan T. and Hawking, David and J{\"a}rvelin, Kalervo},
  ee        = {http://doi.acm.org/10.1145/1148170.1148243},
  interhash = {0703044e3abd1580680e66f2355813c6},
  intrahash = {27e76ac1174db2a3ee4a3efd34bb2e16},
  isbn      = {1-59593-369-7},
  pages     = {421--428},
  publisher = {ACM},
  title     = {Near-duplicate detection by instance-level constrained clustering},
  url       = {http://dblp.uni-trier.de/db/conf/sigir/sigir2006.html#YangC06},
  year      = 2006
}

@article{kapidakis2008duplicate,
  author    = {Sitas, Anestis and Kapidakis, Sarantos},
  interhash = {94c3f69a754778b492d725bb08ffc0fb},
  intrahash = {633b89b5a6827d28513545282f9f8bc7},
  journal   = {Library Hi Tech},
  number    = 2,
  pages     = {287--301},
  title     = {Duplicate detection algorithms of bibliographic descriptions},
  url       = {http://www.ionio.gr/~sarantos/repository/j21J-LibraryHiTech-Sitas.pdf},
  volume    = 26,
  year      = 2008
}

@inproceedings{Padmasree06,
  address   = {Alexandria},
  author    = {Padmasree, Lam and Ambati, Vamshi and Chandulal, Jasthi Anand and Rao, Meda Sreenivasa},
  booktitle = {Proceedings of the 2nd ICUDL},
  editor    = {Serageldin, Ismail and Reddy, Raj},
  interhash = {06c68062346c3b1634c950a8bf96deec},
  intrahash = {6369260b8ed58d9445b8d2df0a1864f4},
  title     = {Signature Based Duplicate Detection in Digital Libraries},
  url       = {http://www.ulib.org/conference/2006/25.pdf},
  year      = 2006
}

@article{hirsch2005index,
  abstract  = {I propose the index h, defined as the number of papers with citation number $\geq h$, as a useful index to characterize the scientific output of a researcher.},
  author    = {Hirsch, J. E.},
  doi       = {10.1073/pnas.0507655102},
  eprint    = {http://www.pnas.org/content/102/46/16569.full.pdf+html},
  interhash = {e45cbc449d42e1841c704f121ec47f24},
  intrahash = {7773c451332a1a0a25313461bee7e045},
  journal   = {Proceedings of the National Academy of Sciences of the United States of America},
  number    = 46,
  pages     = {16569--16572},
  title     = {An index to quantify an individual's scientific research output},
  url       = {http://www.pnas.org/content/102/46/16569.abstract},
  volume    = 102,
  year      = 2005
}

% NOTE(review): "159--ff" below is the exporter's placeholder for an unknown
% last page; replace it with the real page range once verified.

@inproceedings{Brandes:2002:VBN:509740.509765,
  abstract  = {We describe a novel approach to visualize bibliographic networks that facilitates the simultaneous identification of clusters (e.g., topic areas) and prominent entities (e.g., surveys or landmark papers). While employing the landscape metaphor proposed in several earlier works, we introduce new means to determine relevant parameters of the landscape. Moreover, we are able to compute prominent entities, clustering of entities, and the landscape's surface in a surprisingly simple and uniform way. The effectiveness of our network visualizations is illustrated on data from the graph drawing literature.},
  acmid     = {509765},
  address   = {Aire-la-Ville, Switzerland},
  author    = {Brandes, U. and Willhalm, T.},
  booktitle = {Proceedings of the symposium on Data Visualisation 2002},
  interhash = {7d070baa654fc70cb8a0b1e373d90e2a},
  intrahash = {e5e72eed2d871523dc1100f060658a1c},
  isbn      = {1-58113-536-X},
  location  = {Barcelona, Spain},
  pages     = {159--ff},
  publisher = {Eurographics Association},
  series    = {VISSYM '02},
  title     = {Visualization of bibliographic networks with a reshaped landscape metaphor},
  url       = {http://portal.acm.org/citation.cfm?id=509740.509765},
  year      = 2002
}

@inproceedings{voss2009mapping,
  abstract     = {This poster presents a set of hash keys for bibliographic records called bibkeys. Unlike other methods of duplicate detection, bibkeys can directly be calculated from a set of basic metadata fields (title, authors/editors, year). It is shown how bibkeys are used to map similar bibliographic records in BibSonomy and among distributed library catalogs and other distributed databases.},
  author       = {Voss, Jakob and Hotho, Andreas and J{\"a}schke, Robert},
  booktitle    = {Information: Droge, Ware oder Commons?},
  editor       = {Kuhlen, Rainer},
  interhash    = {6e394e459d11dfa17f5d4cf1b8dd81c3},
  intrahash    = {01f6fe57f46e4b92fe02869341efdd8d},
  organization = {Hochschulverband Informationswissenschaft},
  publisher    = {Verlag Werner H{\"u}lsbusch},
  series       = {Proceedings of the ISI},
  title        = {Mapping Bibliographic Records with Bibliographic Hash Keys},
  url          = {http://eprints.rclis.org/15953/},
  year         = 2009
}

@misc{darcus2008bibliographic,
  abstract     = {The Bibliographic Ontology Specification provides main concepts and properties for describing citations and bibliographic references (i.e. quotes, books, articles, etc) on the Semantic Web.},
  author       = {D'Arcus, Bruce and Giasson, Fr{\'e}d{\'e}rick},
  editor       = {Giasson, Fr{\'e}d{\'e}rick},
  howpublished = {Specification Document},
  interhash    = {9a7903afc37b62c3bbeaebbf8023c5db},
  intrahash    = {209d061c6809463a86308a8091eace8f},
  title        = {Bibliographic Ontology Specification},
  url          = {http://bibliontology.com/},
  year         = 2008
}

% NOTE(review): the key Haase04Personalized below and the key
% haase04personalized at the end of this file differ only by letter case.
% Classic BibTeX compares keys case-insensitively and reports the second one
% as a repeated entry; Biber treats them as distinct. They are different
% publications (book chapter vs. workshop paper), so rename one of the keys
% together with its citations when that is feasible.

@incollection{Haase04Personalized,
  author    = {Haase, Peter and Ehrig, Marc and Hotho, Andreas and Schnizler, Bj{\"o}rn},
  booktitle = {Peer-to-Peer and Semantic Web, Decentralized Management and Exchange of Knowledge and Information},
  editor    = {Staab, Steffen and Stuckenschmidt, Heiner},
  interhash = {a319301254d5f1ead9543d401957bd6a},
  intrahash = {7e995de813d2ff340285a51ddeaf8892},
  isbn      = {3-540-28346-3},
  pages     = {143--158},
  publisher = {Springer},
  title     = {Personalized Information Access in a Bibliographic Peer-to-Peer System},
  vgwort    = {29},
  year      = 2006
}

@inproceedings{827147,
  abstract  = {In this paper, we propose a method for extracting bibliographic attributes from reference strings captured using Optical Character Recognition (OCR) and an extended hidden Markov model. Bibliographic attribute extraction can be used in two ways. One is reference parsing in which attribute values are extracted from OCR-processed references for bibliographic matching. The other is reference alignment in which attribute values are aligned to the bibliographic record to enrich the vocabulary of the bibliographic database. In this paper, we first propose a statistical model for attribute extraction that represents both the syntactical structure of references and OCR error patterns. Then, we perform experiments using bibliographic references obtained from scanned images of papers in journals and transactions and show that useful attribute values are extracted from OCR-processed references. We also show that the proposed model has advantages in reducing the cost of preparing training data, a critical problem in rule-based systems.},
  address   = {Washington, DC, USA},
  author    = {Takasu, Atsuhiro},
  booktitle = {JCDL '03: Proceedings of the 3rd ACM/IEEE-CS joint conference on Digital libraries},
  interhash = {324c5995d000ceffa826eb2950dcd52e},
  intrahash = {73b4dff8c6fac17b3ea377ed5b162540},
  isbn      = {0-7695-1939-3},
  pages     = {49--60},
  publisher = {IEEE Computer Society},
  title     = {Bibliographic attribute extraction from erroneous references based on a statistical model},
  year      = 2003
}

@inproceedings{pm04accurate,
  author    = {Peng, Fuchun and McCallum, Andrew},
  booktitle = {HLT-NAACL},
  ee        = {http://acl.ldc.upenn.edu/hlt-naacl2004/main/pdf/176_Paper.pdf},
  interhash = {8f9ef6b359fef3bd08bfed653fe1bb55},
  intrahash = {8d04bc19e470fe4b98e15a27a1e6e7e9},
  pages     = {329--336},
  title     = {Accurate Information Extraction from Research Papers using Conditional Random Fields},
  url       = {http://www.cs.umass.edu/~mccallum/papers/hlt2004.pdf},
  year      = 2004
}

@inproceedings{haase04personalized,
  author    = {Haase, Peter and Ehrig, Marc and Hotho, Andreas and Schnizler, Bj{\"o}rn},
  booktitle = {Proceedings of the AAAI Workshop on Semantic Web Personalization, 2004},
  interhash = {7f0bd5b9167c6f832d8efeddfecef0f8},
  intrahash = {b98d9b4c35eba302515f1088470de222},
  month     = jul,
  pages     = {1--12},
  publisher = {AAAI Press},
  title     = {Personalized Information Access in a Bibliographic Peer-to-Peer System},
  url       = {http://www.kde.cs.uni-kassel.de/hotho/pub/2004/haase04person.pdf},
  year      = 2004
}