@proceedings{thierrypoibeau2013multisource, abstract = {Information extraction (IE) and text summarization (TS) are powerful technologies for finding relevant pieces of information in text and presenting them to the user in condensed form. The ongoing information explosion makes IE and TS critical for successful functioning within the information society. These technologies face particular challenges due to the inherent multi-source nature of the information explosion. The technologies must now handle not isolated texts or individual narratives, but rather large-scale repositories and streams--in general, in multiple languages--containing a multiplicity of perspectives, opinions, or commentaries on particular topics, entities or events. There is thus a need to adapt existing techniques and develop new ones to deal with these challenges. This volume contains a selection of papers that present a variety of methodologies for content identification and extraction, as well as for content fusion and regeneration. The chapters cover various aspects of the challenges, depending on the nature of the information sought--names vs. events--and the nature of the sources--news streams vs. image captions vs. scientific research papers, etc. This volume aims to offer a broad and representative sample of studies from this very active research field.}, address = {Berlin; New York}, editor = {Poibeau, Thierry and Saggion, Horacio and Piskorski, Jakub and Yangarber, Roman}, interhash = {b1d51398d5660ed1e16f40d74cc815db}, intrahash = {21816f2809a2b58397acce5ac9558d28}, isbn = {978-3-642-28569-1}, publisher = {Springer}, refid = {808368416}, title = {Multi-source, multilingual information extraction and summarization}, url = {http://link.springer.com/book/10.1007/978-3-642-28569-1}, year = 2013 }

@incollection{piskorski2013information, abstract = {In this chapter we present a brief overview of Information Extraction, which is an area of natural language processing that deals with finding factual information in free text.
}, author = {Piskorski, Jakub and Yangarber, Roman}, booktitle = {Multi-source, Multilingual Information Extraction and Summarization}, doi = {10.1007/978-3-642-28569-1_2}, editor = {Poibeau, Thierry and Saggion, Horacio and Piskorski, Jakub and Yangarber, Roman}, interhash = {276145faeb3b45461f09f6ae5aabef5e}, intrahash = {55c1de993e15515d35b68a512088d607}, isbn = {978-3-642-28568-4}, language = {English}, pages = {23--49}, publisher = {Springer Berlin Heidelberg}, series = {Theory and Applications of Natural Language Processing}, title = {Information Extraction: Past, Present and Future}, url = {http://dx.doi.org/10.1007/978-3-642-28569-1_2}, year = 2013 }

@inproceedings{chrupala2010named, author = {Chrupala, Grzegorz and Klakow, Dietrich}, booktitle = {LREC}, crossref = {conf/lrec/2010}, editor = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Rosner, Mike and Tapias, Daniel}, ee = {http://www.lrec-conf.org/proceedings/lrec2010/summaries/538.html}, interhash = {85b8f5e04b66df3fe9411fc8f81ae43a}, intrahash = {68b98f37dc2dd0a89f580d9e6b65c780}, isbn = {2-9517408-6-7}, publisher = {European Language Resources Association}, title = {A Named Entity Labeler for German: Exploiting Wikipedia and Distributional Clusters.}, url = {http://lexitron.nectec.or.th/public/LREC-2010_Malta/pdf/538_Paper.pdf}, year = 2010 }

@inproceedings{conf/pkdd/KluglTLHP12, author = {Klügl, Peter and Toepfer, Martin and Lemmerich, Florian and Hotho, Andreas and Puppe, Frank}, booktitle = {ECML/PKDD (1)}, crossref = {conf/pkdd/2012-1}, editor = {Flach, Peter A. and Bie, Tijl De and Cristianini, Nello}, ee = {http://dx.doi.org/10.1007/978-3-642-33460-3_52}, interhash = {ccd3a716939562b7e91ecb057ae7df2d}, intrahash = {afd38525dbe0f52db7389e03aa7df1f7}, isbn = {978-3-642-33459-7}, pages = {728--743}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Collective Information Extraction with Context-Specific Consistencies.}, url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2012-1.html#KluglTLHP12}, volume = 7523, year = 2012 }

@article{ley2009lessons, abstract = {The DBLP Computer Science Bibliography evolved from an early small experimental Web server to a popular service for the computer science community. Many design decisions and details of the public XML-records behind DBLP never were documented. This paper is a review of the evolution of DBLP. The main perspective is data modeling. In DBLP persons play a central role; our discussion of person names may be applicable to many other data bases. All DBLP data are available for your own experiments. You may either download the complete set, or use a simple XML-based API described in an online appendix.}, acmid = {1687577}, author = {Ley, Michael}, interhash = {a75ae2987d55512b7d0731c7a11a1722}, intrahash = {bb968ff4ba9ae93bc80ba05d16a98ff4}, issn = {2150-8097}, issue_date = {August 2009}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 2, numpages = {8}, pages = {1493--1500}, publisher = {VLDB Endowment}, title = {DBLP: some lessons learned}, url = {http://dl.acm.org/citation.cfm?id=1687553.1687577}, volume = 2, year = 2009 }

@article{tejada2001learning, abstract = {When integrating information from multiple websites, the same data objects can exist in inconsistent text formats across sites, making it difficult to identify matching objects using exact text match.
We have developed an object identification system called Active Atlas, which compares the objects’ shared attributes in order to identify matching objects. Certain attributes are more important for deciding if a mapping should exist between two objects. Previous methods of object identification have required manual construction of object identification rules or mapping rules for determining the mappings between objects. This manual process is time consuming and error-prone. In our approach, Active Atlas learns to tailor mapping rules, through limited user input, to a specific application domain. The experimental results demonstrate that we achieve higher accuracy and require less user involvement than previous methods across various application domains.}, author = {Tejada, Sheila and Knoblock, Craig A. and Minton, Steven}, doi = {10.1016/S0306-4379(01)00042-4}, interhash = {f9f59187b0397a0fbe1e558dfb4ad9cf}, intrahash = {5ad46801d602408ce271276f452263a9}, issn = {0306-4379}, journal = {Information Systems}, month = dec, number = 8, pages = {607--633}, title = {Learning object identification rules for information integration}, url = {http://www.sciencedirect.com/science/article/pii/S0306437901000424}, volume = 26, year = 2001 }

@inproceedings{lafferty2001conditional, acmid = {655813}, address = {San Francisco, CA, USA}, author = {Lafferty, John D. and McCallum, Andrew and Pereira, Fernando C. N.}, booktitle = {Proceedings of the Eighteenth International Conference on Machine Learning}, interhash = {574c59001ecc3aa04850e1751d96c137}, intrahash = {180c5d6097317fa1b19ca8df75341230}, isbn = {1-55860-778-1}, numpages = {8}, pages = {282--289}, publisher = {Morgan Kaufmann Publishers Inc.}, title = {Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data}, url = {http://dl.acm.org/citation.cfm?id=645530.655813}, year = 2001 }

@inproceedings{granitzer2012comparison, abstract = {Social research networks such as Mendeley and CiteULike offer various services for collaboratively managing bibliographic metadata. Compared with traditional libraries, metadata quality is of crucial importance in order to create a crowdsourced bibliographic catalog for search and browsing. Artifacts, in particular PDFs which are managed by the users of the social research networks, become one important metadata source and the starting point for creating a homogeneous, high quality, bibliographic catalog. Natural Language Processing and Information Extraction techniques have been employed to extract structured information from unstructured sources. However, given highly heterogeneous artifacts that cover a range of publication styles, stemming from different publication sources, and imperfect PDF processing tools, how accurate are metadata extraction methods in such real-world settings? This paper focuses on answering that question by investigating the use of Conditional Random Fields and Support Vector Machines on real-world data gathered from Mendeley and Linked-Data repositories. We compare style and content features on existing state-of-the-art methods on two newly created real-world data sets for metadata extraction.
Our analysis shows that two-stage SVMs provide reasonable performance in solving the challenge of metadata extraction for crowdsourcing bibliographic metadata management.}, acmid = {2254154}, address = {New York, NY, USA}, articleno = {19}, author = {Granitzer, Michael and Hristakeva, Maya and Knight, Robert and Jack, Kris and Kern, Roman}, booktitle = {Proceedings of the 2nd International Conference on Web Intelligence, Mining and Semantics}, doi = {10.1145/2254129.2254154}, interhash = {bfa622b68be4bb039ca0516b3b33ec40}, intrahash = {7194c862da359af9aa18b4d865cbce55}, isbn = {978-1-4503-0915-8}, location = {Craiova, Romania}, numpages = {8}, pages = {19:1--19:8}, publisher = {ACM}, title = {A comparison of layout based bibliographic metadata extraction techniques}, url = {http://doi.acm.org/10.1145/2254129.2254154}, year = 2012 }

@inproceedings{kristjansson2004interactive, abstract = {Information Extraction methods can be used to automatically "fill-in" database forms from unstructured data such as Web documents or email. State-of-the-art methods have achieved low error rates but invariably make a number of errors. The goal of an interactive information extraction system is to assist the user in filling in database fields while giving the user confidence in the integrity of the data. The user is presented with an interactive interface that allows both the rapid verification of automatic field assignments and the correction of errors. In cases where there are multiple errors, our system takes into account user corrections, and immediately propagates these constraints such that other fields are often corrected automatically. Linear-chain conditional random fields (CRFs) have been shown to perform well for information extraction and other language modelling tasks due to their ability to capture arbitrary, overlapping features of the input in a Markov model. We apply this framework with two extensions: a constrained Viterbi decoding which finds the optimal field assignments consistent with the fields explicitly specified or corrected by the user; and a mechanism for estimating the confidence of each extracted field, so that low-confidence extractions can be highlighted. Both of these mechanisms are incorporated in a novel user interface for form filling that is intuitive and speeds the entry of data—providing a 23% reduction in error due to automated corrections.}, author = {Kristjansson, Trausti T. and Culotta, Aron and Viola, Paul A. and McCallum, Andrew}, booktitle = {AAAI}, editor = {McGuinness, Deborah L. and Ferguson, George}, interhash = {89fe7fe6ef4c088b10d3b0b0aabeaf46}, intrahash = {fe6cb1dbef3216852a63a625a30799d6}, isbn = {0-262-51183-5}, pages = {412--418}, publisher = {AAAI Press/The MIT Press}, title = {Interactive Information Extraction with Constrained Conditional Random Fields.}, url = {http://dblp.uni-trier.de/db/conf/aaai/aaai2004.html#KristjanssonCVM04}, year = 2004 }

@article{raykar2010learning, abstract = {For many supervised learning tasks it may be infeasible (or very expensive) to obtain objective and reliable labels. Instead, we can collect subjective (possibly noisy) labels from multiple experts or annotators. In practice, there is a substantial amount of disagreement among the annotators, and hence it is of great practical interest to address conventional supervised learning problems in this scenario.
In this paper we describe a probabilistic approach for supervised learning when we have multiple annotators providing (possibly noisy) labels but no absolute gold standard. The proposed algorithm evaluates the different experts and also gives an estimate of the actual hidden labels. Experimental results indicate that the proposed method is superior to the commonly used majority voting baseline.}, acmid = {1859894}, author = {Raykar, Vikas C. and Yu, Shipeng and Zhao, Linda H. and Valadez, Gerardo Hermosillo and Florin, Charles and Bogoni, Luca and Moy, Linda}, interhash = {8113daf47997fddf48e4c6c79f2eba56}, intrahash = {14220abe8babfab01c0cdd5ebd5e4b7c}, issn = {1532-4435}, issue_date = {3/1/2010}, journal = {Journal of Machine Learning Research}, month = aug, numpages = {26}, pages = {1297--1322}, publisher = {JMLR.org}, title = {Learning From Crowds}, url = {http://dl.acm.org/citation.cfm?id=1756006.1859894}, volume = 11, year = 2010 }

@article{balke2012introduction, abstract = {Transforming unstructured or semi-structured information into structured knowledge is one of the big challenges of today’s knowledge society. While this abstract goal is still unreached and probably unreachable, intelligent information extraction techniques are considered key ingredients on the way to generating and representing knowledge for a wide variety of applications. This is especially true for the current efforts to turn the World Wide Web, the world’s largest collection of information, into the world’s largest knowledge base. This introduction gives a broad overview about the major topics and current trends in information extraction.}, address = {Berlin/Heidelberg}, affiliation = {Institut für Informationssysteme, Technische Universität Braunschweig, Braunschweig, Germany}, author = {Balke, Wolf-Tilo}, doi = {10.1007/s13222-012-0090-x}, interhash = {0127ba6c59c3f7f7121429eb098a4b90}, intrahash = {992b3c989c8fda7c58cd9262e2f70907}, issn = {1618-2162}, journal = {Datenbank-Spektrum}, keyword = {Computer Science}, number = 2, pages = {81--88}, publisher = {Springer}, title = {Introduction to Information Extraction: Basic Notions and Current Trends}, url = {http://dx.doi.org/10.1007/s13222-012-0090-x}, volume = 12, year = 2012 }

@incollection{li2011incorporating, abstract = {In scientific cooperation networks, ambiguous author names may occur due to the existence of multiple authors with the same name. Users of these networks usually want to know the exact author of a paper, whereas we do not have any unique identifier to distinguish them. In this paper, we focus on this problem and propose a new method that incorporates user feedback into the model for name disambiguation of scientific cooperation networks. A perceptron is used as the classifier. Two features and a constraint drawn from user feedback are incorporated into the perceptron to enhance the performance of name disambiguation. Specifically, we construct user feedback as a training stream, and refine the perceptron continuously. Experimental results show that the proposed algorithm can learn continuously and significantly outperforms the previous methods without introducing user interactions.}, address = {Berlin/Heidelberg}, affiliation = {Intelligent and Distributed Computing Lab, School of Computer Science and Technology, Huazhong University of Science and Technology, Wuhan, 430074 P.R.
China}, author = {Li, Yuhua and Wen, Aiming and Lin, Quan and Li, Ruixuan and Lu, Zhengding}, booktitle = {Web-Age Information Management}, doi = {10.1007/978-3-642-23535-1_39}, editor = {Wang, Haixun and Li, Shijun and Oyama, Satoshi and Hu, Xiaohua and Qian, Tieyun}, interhash = {3baace12cb4481dcceb53c2d47f413b5}, intrahash = {96f2ae8551126527c2dfe69c8fa22f6c}, isbn = {978-3-642-23534-4}, keyword = {Computer Science}, pages = {454--466}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Incorporating User Feedback into Name Disambiguation of Scientific Cooperation Network}, url = {http://dx.doi.org/10.1007/978-3-642-23535-1_39}, volume = 6897, year = 2011 }

@article{lofi2012information, abstract = {Recent years brought tremendous advancements in the area of automated information extraction. But still, problem scenarios remain where even state-of-the-art algorithms do not provide a satisfying solution. In these cases, another aspiring recent trend can be exploited to achieve the required extraction quality: explicit crowdsourcing of human intelligence tasks. In this paper, we discuss the synergies between information extraction and crowdsourcing. In particular, we methodically identify and classify the challenges and fallacies that arise when combining both approaches. Furthermore, we argue that for harnessing the full potential of either approach, true hybrid techniques must be considered. To demonstrate this point, we showcase such a hybrid technique, which tightly interweaves information extraction with crowdsourcing and machine learning to vastly surpass the abilities of either technique.}, address = {Berlin/Heidelberg}, affiliation = {Institut für Informationssysteme, Technische Universität Braunschweig, Braunschweig, Germany}, author = {Lofi, Christoph and Selke, Joachim and Balke, Wolf-Tilo}, doi = {10.1007/s13222-012-0092-8}, interhash = {941feeaa7bb134e0a5f8b5c0225756b8}, intrahash = {37cc8f1d19105a073544d6594fbbc033}, issn = {1618-2162}, journal = {Datenbank-Spektrum}, keyword = {Computer Science}, number = 2, pages = {109--120}, publisher = {Springer}, title = {Information Extraction Meets Crowdsourcing: A Promising Couple}, url = {http://dx.doi.org/10.1007/s13222-012-0092-8}, volume = 12, year = 2012 }

@inproceedings{paton2011feedback, abstract = {User feedback is gaining momentum as a means of addressing the difficulties underlying information integration tasks. It can be used to assist users in building information integration systems and to improve the quality of existing systems, e.g., in dataspaces. Existing proposals in the area are confined to specific integration sub-problems considering a specific kind of feedback sought, in most cases, from a single user. We argue in this paper that, in order to maximize the benefits that can be drawn from user feedback, it should be considered and managed as a first class citizen. Accordingly, we present generic operations that underpin the management of feedback within information integration systems, and that are applicable to feedback of different kinds, potentially supplied by multiple users with different expectations. We present preliminary solutions that can be adopted for realizing such operations, and sketch a research agenda for the information integration community.}, author = {Paton, Norman W. and Fernandes, Alvaro A. A.
and Hedeler, Cornelia and Embury, Suzanne M.}, booktitle = {Proceedings of the Conference on Innovative Data Systems Research (CIDR)}, interhash = {1874e5c09919244808457021d2d884d1}, intrahash = {cd75210156615616e4f25c91143040c4}, pages = {175--183}, title = {User Feedback as a First Class Citizen in Information Integration Systems}, url = {http://www.cidrdb.org/cidr2011/Papers/CIDR11_Paper21.pdf}, year = 2011 }

@article{doan2009information, abstract = {Over the past few years, we have been trying to build an end-to-end system at Wisconsin to manage unstructured data, using extraction, integration, and user interaction. This paper describes the key information extraction (IE) challenges that we have run into, and sketches our solutions. We discuss in particular developing a declarative IE language, optimizing for this language, generating IE provenance, incorporating user feedback into the IE process, developing a novel wiki-based user interface for feedback, best-effort IE, pushing IE into RDBMSs, and more. Our work suggests that IE in managing unstructured data can open up many interesting research challenges, and that these challenges can greatly benefit from the wealth of work on managing structured data that has been carried out by the database community.}, acmid = {1519106}, address = {New York, NY, USA}, author = {Doan, AnHai and Naughton, Jeffrey F. and Ramakrishnan, Raghu and Baid, Akanksha and Chai, Xiaoyong and Chen, Fei and Chen, Ting and Chu, Eric and DeRose, Pedro and Gao, Byron and Gokhale, Chaitanya and Huang, Jiansheng and Shen, Warren and Vuong, Ba-Quy}, doi = {10.1145/1519103.1519106}, interhash = {b80d6ce47b976503692def4e86b0097d}, intrahash = {fccc9f25a1c70cb71d3377a7ddfe1614}, issn = {0163-5808}, issue_date = {December 2008}, journal = {SIGMOD Record}, month = mar, number = 4, numpages = {7}, pages = {14--20}, publisher = {ACM}, title = {Information extraction challenges in managing unstructured data}, url = {http://doi.acm.org/10.1145/1519103.1519106}, volume = 37, year = 2009 }

@inproceedings{chai2009efficiently, abstract = {Many applications increasingly employ information extraction and integration (IE/II) programs to infer structures from unstructured data. Automatic IE/II are inherently imprecise. Hence such programs often make many IE/II mistakes, and thus can significantly benefit from user feedback. Today, however, there is no good way to automatically provide and process such feedback. When finding an IE/II mistake, users often must alert the developer team (e.g., via email or Web form) about the mistake, and then wait for the team to manually examine the program internals to locate and fix the mistake, a slow, error-prone, and frustrating process.

In this paper we propose a solution for users to directly provide feedback and for IE/II programs to automatically process such feedback. In our solution a developer U uses hlog, a declarative IE/II language, to write an IE/II program P. Next, U writes declarative user feedback rules that specify which parts of P's data (e.g., input, intermediate, or output data) users can edit, and via which user interfaces. Next, the so-augmented program P is executed, then enters a loop of waiting for and incorporating user feedback. Given user feedback F on a data portion of P, we show how to automatically propagate F to the rest of P, and to seamlessly combine F with prior user feedback. We describe the syntax and semantics of hlog, a baseline execution strategy, and then various optimization techniques. Finally, we describe experiments with real-world data that demonstrate the promise of our solution.}, acmid = {1559857}, address = {New York, NY, USA}, author = {Chai, Xiaoyong and Vuong, Ba-Quy and Doan, AnHai and Naughton, Jeffrey F.}, booktitle = {Proceedings of the 35th SIGMOD international conference on Management of data}, doi = {10.1145/1559845.1559857}, interhash = {5860215447e374b059597c0e3864e388}, intrahash = {d6c9fbf442a935dc0618107f8fb54d44}, isbn = {978-1-60558-551-2}, location = {Providence, Rhode Island, USA}, numpages = {14}, pages = {87--100}, publisher = {ACM}, title = {Efficiently incorporating user feedback into information extraction and integration programs}, url = {http://doi.acm.org/10.1145/1559845.1559857}, year = 2009 }

@inproceedings{AB:10, author = {Atzmueller, Martin and Beer, Stephanie}, booktitle = {Proc. 55th IWK, International Workshop on Design, Evaluation and Refinement of Intelligent Systems (DERIS)}, interhash = {7e80a6b45a723165b02d8e33581da64e}, intrahash = {a735dbe20e7e04c577c3eb4e67ebede2}, publisher = {University of Ilmenau}, title = {Validation of Mixed-Structured Data Using Pattern Mining and Information Extraction}, year = 2010 }

@inproceedings{kdml21, abstract = {The accurate extraction of bibliographic information from scientific publications is an active field of research. Machine learning and sequence labeling approaches like Conditional Random Fields (CRF) are often applied for this reference extraction task, but still suffer from the ambiguity of reference notation. Reference sections follow a predefined style guide and contain only homogeneous references. Therefore, other references of the same paper or journal often provide evidence of how the fields of a reference are correctly labeled. We propose a novel approach that exploits the similarities within a document. Our process model uses information of unlabeled documents directly during the extraction task in order to automatically adapt to the perceived style guide. This is implemented by changing the manifestation of the features for the applied CRF. The experimental results show considerable improvements compared to the common approach.
We achieve an average F1 score of 96.7% and an instance accuracy of 85.4% on the test data set.}, address = {Kassel, Germany}, author = {Toepfer, Martin and Kluegl, Peter and Hotho, Andreas and Puppe, Frank}, booktitle = {Proceedings of LWA2010 - Workshop-Woche: Lernen, Wissen {\&} Adaptivitaet}, crossref = {lwa2010}, editor = {Atzmüller, Martin and Benz, Dominik and Hotho, Andreas and Stumme, Gerd}, interhash = {d8f45281363701bfe7f979b1e13ee269}, intrahash = {37242cd584805b2e4cea0c486008889d}, title = {Conditional Random Fields For Local Adaptive Reference Extraction}, url = {http://www.kde.cs.uni-kassel.de/conf/lwa10/papers/kdml21.pdf}, year = 2010 }

@inproceedings{conf/semweb/TangHLL06, author = {Tang, Jie and Hong, MingCai and Li, Juan-Zi and Liang, Bangyong}, booktitle = {International Semantic Web Conference}, crossref = {conf/semweb/2006}, date = {2006-11-09}, editor = {Cruz, Isabel F. and Decker, Stefan and Allemang, Dean and Preist, Chris and Schwabe, Daniel and Mika, Peter and Uschold, Michael and Aroyo, Lora}, ee = {http://dx.doi.org/10.1007/11926078_46}, interhash = {0cd79ca123126fe66d0e2f2888222c79}, intrahash = {e378a25116a480b55e64a919a351f1a7}, isbn = {3-540-49029-9}, pages = {640--653}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Tree-Structured Conditional Random Fields for Semantic Annotation.}, url = {http://dblp.uni-trier.de/db/conf/semweb/iswc2006.html#TangHLL06}, volume = 4273, year = 2006 }

@inproceedings{2009:KI:KAP:MetaLevel, address = {Berlin}, author = {Kluegl, Peter and Atzmueller, Martin and Puppe, Frank}, booktitle = {The 32nd Annual Conference on Artificial Intelligence}, interhash = {9cc3c48a3f664aa143bf55475c2a3dcd}, intrahash = {0961873e89b97b18298bc922b6706bae}, month = {September}, note = {(accepted)}, publisher = {Springer}, title = {Meta-Level Information Extraction}, year = 2009 }