@article{doan2009information,
  abstract = {Over the past few years, we have been trying to build an end-to-end system at Wisconsin to manage unstructured data, using extraction, integration, and user interaction. This paper describes the key information extraction (IE) challenges that we have run into, and sketches our solutions. We discuss in particular developing a declarative IE language, optimizing for this language, generating IE provenance, incorporating user feedback into the IE process, developing a novel wiki-based user interface for feedback, best-effort IE, pushing IE into RDBMSs, and more. Our work suggests that IE in managing unstructured data can open up many interesting research challenges, and that these challenges can greatly benefit from the wealth of work on managing structured data that has been carried out by the database community.},
  acmid = {1519106},
  address = {New York, NY, USA},
  author = {Doan, AnHai and Naughton, Jeffrey F. and Ramakrishnan, Raghu and Baid, Akanksha and Chai, Xiaoyong and Chen, Fei and Chen, Ting and Chu, Eric and DeRose, Pedro and Gao, Byron and Gokhale, Chaitanya and Huang, Jiansheng and Shen, Warren and Vuong, Ba-Quy},
  doi = {10.1145/1519103.1519106},
  issn = {0163-5808},
  issue_date = {December 2008},
  journal = {SIGMOD Record},
  month = mar,
  number = {4},
  numpages = {7},
  pages = {14--20},
  publisher = {ACM},
  title = {Information extraction challenges in managing unstructured data},
  url = {http://doi.acm.org/10.1145/1519103.1519106},
  volume = {37},
  year = {2009}
}

@inproceedings{chai2009efficiently,
  abstract = {Many applications increasingly employ information extraction and integration (IE/II) programs to infer structures from unstructured data. Automatic IE/II are inherently imprecise. Hence such programs often make many IE/II mistakes, and thus can significantly benefit from user feedback. Today, however, there is no good way to automatically provide and process such feedback. When finding an IE/II mistake, users often must alert the developer team (e.g., via email or Web form) about the mistake, and then wait for the team to manually examine the program internals to locate and fix the mistake, a slow, error-prone, and frustrating process.

In this paper we propose a solution for users to directly provide feedback and for IE/II programs to automatically process such feedback. In our solution a developer U uses hlog, a declarative IE/II language, to write an IE/II program P. Next, U writes declarative user feedback rules that specify which parts of P's data (e.g., input, intermediate, or output data) users can edit, and via which user interfaces. The so-augmented program P is then executed and enters a loop of waiting for and incorporating user feedback. Given user feedback F on a data portion of P, we show how to automatically propagate F to the rest of P, and how to seamlessly combine F with prior user feedback. We describe the syntax and semantics of hlog, a baseline execution strategy, and various optimization techniques. Finally, we describe experiments with real-world data that demonstrate the promise of our solution.},
  acmid = {1559857},
  address = {New York, NY, USA},
  author = {Chai, Xiaoyong and Vuong, Ba-Quy and Doan, AnHai and Naughton, Jeffrey F.},
  booktitle = {Proceedings of the 35th SIGMOD International Conference on Management of Data},
  doi = {10.1145/1559845.1559857},
  isbn = {978-1-60558-551-2},
  location = {Providence, Rhode Island, USA},
  numpages = {14},
  pages = {87--100},
  publisher = {ACM},
  series = {SIGMOD '09},
  title = {Efficiently incorporating user feedback into information extraction and integration programs},
  url = {http://doi.acm.org/10.1145/1559845.1559857},
  year = {2009}
}

@inproceedings{hu2007measuring,
  abstract = {Wikipedia has grown to be the world's largest and busiest free encyclopedia, in which articles are collaboratively written and maintained by volunteers online. Despite its success as a means of knowledge sharing and collaboration, the public has never stopped criticizing the quality of Wikipedia articles edited by non-experts and inexperienced contributors. In this paper, we investigate the problem of assessing the quality of articles in collaborative authoring of Wikipedia. We propose three article quality measurement models that make use of the interaction data between articles and their contributors derived from the article edit history. Our Basic model is designed based on the mutual dependency between article quality and author authority. The PeerReview model introduces review behavior into measuring article quality. Finally, our ProbReview models extend PeerReview with partial reviewership of contributors as they edit various portions of the articles. We conduct experiments on a set of well-labeled Wikipedia articles to evaluate the effectiveness of our quality measurement models in resembling human judgement.},
  acmid = {1321476},
  address = {New York, NY, USA},
  author = {Hu, Meiqun and Lim, Ee-Peng and Sun, Aixin and Lauw, Hady Wirawan and Vuong, Ba-Quy},
  booktitle = {Proceedings of the Sixteenth ACM Conference on Information and Knowledge Management},
  doi = {10.1145/1321440.1321476},
  isbn = {978-1-59593-803-9},
  location = {Lisbon, Portugal},
  numpages = {10},
  pages = {243--252},
  publisher = {ACM},
  series = {CIKM '07},
  title = {Measuring article quality in {Wikipedia}: models and evaluation},
  url = {http://doi.acm.org/10.1145/1321440.1321476},
  year = {2007}
}