% Bibliography section: every entry below lists Raghu Ramakrishnan among its
% authors. Each citation key appears exactly once, because BibTeX and Biber
% both report a key that is defined twice as a repeated entry.
% Fields such as interhash, intrahash, acmid, numpages, issue_date, cite and ee
% are not standard BibTeX fields; standard styles ignore unknown fields, so
% they are carried along as metadata only.

% ICDT 1999 paper. The crossref parent (conf/icdt/99) is not defined in this
% section; NOTE(review): confirm it exists elsewhere in the file, and for
% classic BibTeX that it appears after this entry.
% mdate holds the value 2002-01-03, presumably a database record timestamp
% (TODO confirm). It is stored under a non-standard field name so that
% biblatex does not read it as the publication date in place of year = 1999.
@inproceedings{conf/icdt/BeyerGRS99,
  author    = {Beyer, Kevin S. and Goldstein, Jonathan and Ramakrishnan, Raghu and Shaft, Uri},
  title     = {When Is ``Nearest Neighbor'' Meaningful?},
  booktitle = {ICDT},
  editor    = {Beeri, Catriel and Buneman, Peter},
  series    = {Lecture Notes in Computer Science},
  volume    = 1540,
  pages     = {217--235},
  publisher = {Springer},
  year      = 1999,
  isbn      = {3-540-65452-6},
  crossref  = {conf/icdt/99},
  cite      = {conf/icde/WhiteJ96},
  ee        = {http://link.springer.de/link/service/series/0558/bibs/1540/15400217.htm},
  url       = {http://dblp.uni-trier.de/db/conf/icdt/icdt99.html#BeyerGRS99},
  mdate     = {2002-01-03},
  interhash = {17f2a2126af823b1b135231d1c189e7d},
  intrahash = {b0beff3a9fa219560f51295a27a3fc5a}
}

% SIGMOD Record 37(4). NOTE(review): issue_date says December 2008 while
% month/year say March 2009; both values are kept as exported -- confirm which
% one the bibliography style should print.
@article{doan2009information,
  author     = {Doan, AnHai and Naughton, Jeffrey F. and Ramakrishnan, Raghu and Baid, Akanksha and Chai, Xiaoyong and Chen, Fei and Chen, Ting and Chu, Eric and DeRose, Pedro and Gao, Byron and Gokhale, Chaitanya and Huang, Jiansheng and Shen, Warren and Vuong, Ba-Quy},
  title      = {Information extraction challenges in managing unstructured data},
  journal    = {SIGMOD Record},
  volume     = 37,
  number     = 4,
  pages      = {14--20},
  month      = mar,
  year       = 2009,
  issue_date = {December 2008},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0163-5808},
  doi        = {10.1145/1519103.1519106},
  url        = {http://doi.acm.org/10.1145/1519103.1519106},
  acmid      = {1519106},
  numpages   = {7},
  abstract   = {Over the past few years, we have been trying to build an end-to-end system at Wisconsin to manage unstructured data, using extraction, integration, and user interaction. This paper describes the key information extraction (IE) challenges that we have run into, and sketches our solutions. We discuss in particular developing a declarative IE language, optimizing for this language, generating IE provenance, incorporating user feedback into the IE process, developing a novel wiki-based user interface for feedback, best-effort IE, pushing IE into RDBMSs, and more. Our work suggests that IE in managing unstructured data can open up many interesting research challenges, and that these challenges can greatly benefit from the wealth of work on managing structured data that has been carried out by the database community.},
  interhash  = {b80d6ce47b976503692def4e86b0097d},
  intrahash  = {fccc9f25a1c70cb71d3377a7ddfe1614}
}

% The first author's given name is spelled AnHai, matching doan2009information,
% so both entries sort and abbreviate as the same person.
% The proper noun in the title is braced so sentence-casing styles keep its
% capitals.
@article{doan2011crowdsourcing,
  author     = {Doan, AnHai and Ramakrishnan, Raghu and Halevy, Alon Y.},
  title      = {Crowdsourcing systems on the {World-Wide Web}},
  journal    = {Communications of the ACM},
  volume     = 54,
  number     = 4,
  pages      = {86--96},
  month      = apr,
  year       = 2011,
  issue_date = {April 2011},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  issn       = {0001-0782},
  doi        = {10.1145/1924421.1924442},
  url        = {http://doi.acm.org/10.1145/1924421.1924442},
  acmid      = {1924442},
  numpages   = {11},
  abstract   = {The practice of crowdsourcing is transforming the Web and giving rise to a new field.},
  interhash  = {6dbf364159ce568b92727145a3fca85e},
  intrahash  = {84f738a6efae5eb6612ea75e8616fecf}
}

% BIRCH, full ACM record. The acronym is braced so sentence-casing styles do
% not lowercase it.
@inproceedings{zhang1996birch,
  author    = {Zhang, Tian and Ramakrishnan, Raghu and Livny, Miron},
  title     = {{BIRCH}: An Efficient Data Clustering Method for Very Large Databases},
  booktitle = {Proceedings of the 1996 ACM SIGMOD International Conference on Management of Data},
  series    = {SIGMOD '96},
  location  = {Montreal, Quebec, Canada},
  pages     = {103--114},
  year      = 1996,
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {0-89791-794-4},
  doi       = {10.1145/233269.233324},
  url       = {http://doi.acm.org/10.1145/233269.233324},
  acmid     = {233324},
  numpages  = {12},
  interhash = {bd3d8e33e8785ecf66408081db016ca4},
  intrahash = {250cecc10ceecd05a96bed00b6cf0fd7}
}

% Same paper as zhang1996birch (identical interhash, authors, pages and year),
% recorded under a second citation key. Kept as its own entry so documents
% that cite either key keep resolving; prefer zhang1996birch for new
% citations, since it carries the DOI.
@inproceedings{zhang96birch,
  author    = {Zhang, Tian and Ramakrishnan, Raghu and Livny, Miron},
  title     = {{BIRCH}: an efficient data clustering method for very large databases},
  booktitle = {Proceedings of the 1996 ACM SIGMOD International Conference on Management of Data (SIGMOD'96)},
  pages     = {103--114},
  year      = 1996,
  url       = {http://citeseer.ist.psu.edu/zhang96birch.html},
  interhash = {bd3d8e33e8785ecf66408081db016ca4},
  intrahash = {d8ede3f66d485d95578bdc3eeda11fc3}
}

% YCSB benchmark paper (SoCC '10). The acronym in the title is braced so
% sentence-casing styles do not lowercase it. The abstract text is carried
% over unchanged.
@inproceedings{Cooper:2010:BCS:1807128.1807152,
  author    = {Cooper, Brian F. and Silberstein, Adam and Tam, Erwin and Ramakrishnan, Raghu and Sears, Russell},
  title     = {Benchmarking cloud serving systems with {YCSB}},
  booktitle = {Proceedings of the 1st ACM symposium on Cloud computing},
  series    = {SoCC '10},
  location  = {Indianapolis, Indiana, USA},
  pages     = {143--154},
  year      = 2010,
  publisher = {ACM},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-0036-0},
  doi       = {10.1145/1807128.1807152},
  url       = {http://doi.acm.org/10.1145/1807128.1807152},
  acmid     = {1807152},
  numpages  = {12},
  abstract  = {While the use of MapReduce systems (such as Hadoop) for large scale data analysis has been widely recognized and studied, we have recently seen an explosion in the number of systems developed for cloud data serving. These newer systems address "cloud OLTP" applications, though they typically do not support ACID transactions. Examples of systems proposed for cloud serving use include BigTable, PNUTS, Cassandra, HBase, Azure, CouchDB, SimpleDB, Voldemort, and many others. Further, they are being applied to a diverse range of applications that differ considerably from traditional (e.g., TPC-C like) serving workloads. The number of emerging cloud serving systems and the wide range of proposed applications, coupled with a lack of apples-to-apples performance comparisons, makes it difficult to understand the tradeoffs between systems and the workloads for which they are suited. We present the "Yahoo! Cloud Serving Benchmark" (YCSB) framework, with the goal of facilitating performance comparisons of the new generation of cloud data serving systems. We define a core set of benchmarks and report results for four widely used systems: Cassandra, HBase, Yahoo!'s PNUTS, and a simple sharded MySQL implementation. We also hope to foster the development of additional cloud benchmark suites that represent other classes of applications by making our benchmark tool available via open source. In this regard, a key feature of the YCSB framework/tool is that it is extensible--it supports easy definition of new workloads, in addition to making it easy to benchmark new systems.},
  interhash = {379999e8da039d731bfb9195691c08e8},
  intrahash = {dd14b6e7abc247836d50af16e87fe5bb}
}