% Conference paper (24th International Conference on Data Engineering, 2008):
% combining automatic extraction/integration with wiki-style user contributions.
@inproceedings{derose2008building,
  author    = {DeRose, Pedro and Chai, Xiaoyong and Gao, Byron J. and Shen, Warren and Doan, AnHai and Bohannon, P. and Zhu, Xiaojin},
  title     = {Building Community Wikipedias: A Machine-Human Partnership Approach},
  booktitle = {24th International Conference on Data Engineering},
  pages     = {646--655},
  month     = apr,
  year      = 2008,
  publisher = {IEEE},
  doi       = {10.1109/ICDE.2008.4497473},
  url       = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4497473&tag=1},
  interhash = {00f45357225b1e75ed93bddb8d456fb7},
  intrahash = {38a2e84d3dfd845d9c260d5f15161c6f},
  abstract  = {The rapid growth of Web communities has motivated many solutions for building community data portals. These solutions follow roughly two approaches. The first approach (e.g., Libra, Citeseer, Cimple) employs semi-automatic methods to extract and integrate data from a multitude of data sources. The second approach (e.g., Wikipedia, Intellipedia) deploys an initial portal in wiki format, then invites community members to revise and add material. In this paper we consider combining the above two approaches to building community portals. The new hybrid machine-human approach brings significant benefits. It can achieve broader and deeper coverage, provide more incentives for users to contribute, and keep the portal more up-to-date with less user effort. In a sense, it enables building "community wikipedias", backed by an underlying structured database that is continuously updated using automatic techniques. We outline our ideas for the new approach, describe its challenges and opportunities, and provide initial solutions. Finally, we describe a real-world implementation and preliminary experiments that demonstrate the utility of the new approach.},
}

% Journal article (SIGMOD Record 37(4)): information extraction challenges
% encountered while building a system to manage unstructured data.
@article{doan2009information,
  author     = {Doan, AnHai and Naughton, Jeffrey F. and Ramakrishnan, Raghu and Baid, Akanksha and Chai, Xiaoyong and Chen, Fei and Chen, Ting and Chu, Eric and DeRose, Pedro and Gao, Byron and Gokhale, Chaitanya and Huang, Jiansheng and Shen, Warren and Vuong, Ba-Quy},
  title      = {Information extraction challenges in managing unstructured data},
  journal    = {{SIGMOD} Record},
  volume     = 37,
  number     = 4,
  pages      = {14--20},
  numpages   = {7},
  month      = mar,
  year       = 2009,
  issue_date = {December 2008},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0163-5808},
  doi        = {10.1145/1519103.1519106},
  url        = {http://doi.acm.org/10.1145/1519103.1519106},
  acmid      = {1519106},
  interhash  = {b80d6ce47b976503692def4e86b0097d},
  intrahash  = {fccc9f25a1c70cb71d3377a7ddfe1614},
  abstract   = {Over the past few years, we have been trying to build an end-to-end system at Wisconsin to manage unstructured data, using extraction, integration, and user interaction.
This paper describes the key information extraction (IE) challenges that we have run into, and sketches our solutions. We discuss in particular developing a declarative IE language, optimizing for this language, generating IE provenance, incorporating user feedback into the IE process, developing a novel wiki-based user interface for feedback, best-effort IE, pushing IE into RDBMSs, and more. Our work suggests that IE in managing unstructured data can open up many interesting research challenges, and that these challenges can greatly benefit from the wealth of work on managing structured data that has been carried out by the database community.},
}

% Conference paper (35th SIGMOD conference, 2009): letting users give feedback
% directly to IE/II programs written in the declarative language hlog.
% `address` is the publisher's city; the conference city is stored in `venue`
% so it does not collide with biblatex's address -> location alias.
@inproceedings{chai2009efficiently,
  author    = {Chai, Xiaoyong and Vuong, Ba-Quy and Doan, AnHai and Naughton, Jeffrey F.},
  title     = {Efficiently incorporating user feedback into information extraction and integration programs},
  booktitle = {Proceedings of the 35th {SIGMOD} international conference on Management of data},
  pages     = {87--100},
  numpages  = {14},
  year      = 2009,
  publisher = {ACM},
  address   = {New York, NY, USA},
  venue     = {Providence, Rhode Island, USA},
  isbn      = {978-1-60558-551-2},
  doi       = {10.1145/1559845.1559857},
  url       = {http://doi.acm.org/10.1145/1559845.1559857},
  acmid     = {1559857},
  interhash = {5860215447e374b059597c0e3864e388},
  intrahash = {d6c9fbf442a935dc0618107f8fb54d44},
  abstract  = {Many applications increasingly employ information extraction and integration (IE/II) programs to infer structures from unstructured data. Automatic IE/II are inherently imprecise. Hence such programs often make many IE/II mistakes, and thus can significantly benefit from user feedback. Today, however, there is no good way to automatically provide and process such feedback.
When finding an IE/II mistake, users often must alert the developer team (e.g., via email or Web form) about the mistake, and then wait for the team to manually examine the program internals to locate and fix the mistake, a slow, error-prone, and frustrating process.

In this paper we propose a solution for users to directly provide feedback and for IE/II programs to automatically process such feedback. In our solution a developer U uses hlog, a declarative IE/II language, to write an IE/II program P. Next, U writes declarative user feedback rules that specify which parts of P's data (e.g., input, intermediate, or output data) users can edit, and via which user interfaces. Next, the so-augmented program P is executed, then enters a loop of waiting for and incorporating user feedback. Given user feedback F on a data portion of P, we show how to automatically propagate F to the rest of P, and to seamlessly combine F with prior user feedback. We describe the syntax and semantics of hlog, a baseline execution strategy, and then various optimization techniques. Finally, we describe experiments with real-world data that demonstrate the promise of our solution.},
}