@incollection{steenweg1992computers,
  address = {London},
  author = {Steenweg, Helge},
  booktitle = {CAA 91, Computer Applications and Quantitative Methods in Archaeology (BAR International Series S 577)},
  editor = {Lock, G. and Moffet, J.},
  interhash = {64da78b4eef1a0fd72e5e1509e3ddbdc},
  intrahash = {f003e1c7e9dfb8dcea4e5c1f4c6ce580},
  pages = {29--38},
  title = {Computers and Social History: Building a Database from Mediaeval Tax Registers for Improved Information Retrieval in Göttingen},
  year = 1992
}

@article{larowe2009scholarly,
  abstract = {The Scholarly Database aims to serve researchers and practitioners interested in the analysis, modelling, and visualization of large-scale data sets. A specific focus of this database is to support macro-evolutionary studies of science and to communicate findings via knowledge-domain visualizations. Currently, the database provides access to about 18 million publications, patents, and grants. About 90% of the publications are available in full text. Except for some datasets with restricted access conditions, the data can be retrieved in raw or pre-processed formats using either a web-based or a relational database client. This paper motivates the need for the database from the perspective of bibliometric/scientometric research. It explains the database design, setup, etc., and reports the temporal, geographical, and topic coverage of data sets currently served via the database. Planned work and the potential for this database to become a global testbed for information science research are discussed at the end of the paper.},
  author = {LaRowe, Gavin and Ambre, Sumeet and Burgoon, John and Ke, Weimao and Börner, Katy},
  doi = {10.1007/s11192-009-0414-2},
  interhash = {1819f263b0ea1b99ec15d0c22b38207e},
  intrahash = {c24611ec1f2efbdcf7f5b26d49af320e},
  issn = {0138-9130},
  journal = {Scientometrics},
  language = {English},
  number = 2,
  pages = {219--234},
  publisher = {Springer Netherlands},
  title = {The Scholarly Database and its utility for scientometrics research},
  url = {http://dx.doi.org/10.1007/s11192-009-0414-2},
  volume = 79,
  year = 2009
}

@article{behm2011asterix,
  abstract = {ASTERIX is a new data-intensive storage and computing platform project spanning UC Irvine, UC Riverside, and UC San Diego. In this paper we provide an overview of the ASTERIX project, starting with its main goal—the storage and analysis of data pertaining to evolving-world models. We describe the requirements and associated challenges, and explain how the project is addressing them. We provide a technical overview of ASTERIX, covering its architecture, its user model for data and queries, and its approach to scalable query processing and data management. ASTERIX utilizes a new scalable runtime computational platform called Hyracks that is also discussed at an overview level; we have recently made Hyracks available in open source for use by other interested parties.
We also relate our work on ASTERIX to the current state of the art and describe the research challenges that we are currently tackling as well as those that lie ahead.},
  address = {Netherlands},
  affiliation = {University of California, Irvine, USA},
  author = {Behm, Alexander and Borkar, Vinayak and Carey, Michael and Grover, Raman and Li, Chen and Onose, Nicola and Vernica, Rares and Deutsch, Alin and Papakonstantinou, Yannis and Tsotras, Vassilis},
  doi = {10.1007/s10619-011-7082-y},
  interhash = {3e06363406f716c5d9340dc2c693adb3},
  intrahash = {42d96cc4877943527a9259424c584740},
  issn = {0926-8782},
  journal = {Distributed and Parallel Databases},
  keyword = {Computer Science},
  number = 3,
  pages = {185--216},
  publisher = {Springer},
  title = {ASTERIX: towards a scalable, semistructured data platform for evolving-world models},
  url = {http://dx.doi.org/10.1007/s10619-011-7082-y},
  volume = 29,
  year = 2011
}

@inproceedings{abiteboul1998incremental,
  abstract = {Semistructured data is not strictly typed like relational or object-oriented data and may be irregular or incomplete. It often arises in practice, e.g., when heterogeneous data sources are integrated or data is taken from the World Wide Web. Views over semistructured data can be used to filter the data and to restructure (or provide structure to) it. To achieve fast query response time, these views are often materialized. This paper studies incremental maintenance techniques for materialized views over semistructured data. We use the graph-based data model OEM and the query language Lorel, developed at Stanford, as the framework for our work. We propose a new algorithm that produces a set of queries that compute the changes to the view based upon a change to the source. We develop an analytic cost model and compare the cost of executing our incremental maintenance algorithm to that of recomputing the view. We show that for nearly all types of database updates, it is more efficient to apply our incremental maintenance algorithm to the view than to recompute the view from the database, even when there are thousands of such updates.},
  author = {Abiteboul, S. and McHugh, J. and Rys, M. and Vassalos, V. and Wiener, J.},
  booktitle = {24th International Conference on Very Large Data Bases},
  interhash = {b395f09383de5eb21d34ad8c2b39ab59},
  intrahash = {32903b757b4b4d118c77f4aeac4b0d94},
  month = aug,
  pages = {38--49},
  publisher = {Morgan Kaufmann},
  title = {Incremental Maintenance for Materialized Views over Semistructured Data},
  url = {http://ilpubs.stanford.edu:8090/340/},
  year = 1998
}

@article{noy2004ontology,
  abstract = {As ontology development becomes a more ubiquitous and collaborative process, ontology versioning and evolution becomes an important area of ontology research. The many similarities between database-schema evolution and ontology evolution will allow us to build on the extensive research in schema evolution. However, there are also important differences between database schemas and ontologies. The differences stem from different usage paradigms, the presence of explicit semantics and different knowledge models. A lot of problems that existed only in theory in database research come to the forefront as practical problems in ontology evolution. These differences have important implications for the development of ontology-evolution frameworks: The traditional distinction between versioning and evolution is not applicable to ontologies. There are several dimensions along which compatibility between versions must be considered.
The set of change operations for ontologies is different. We must develop automatic techniques for finding similarities and differences between versions.},
  address = {London},
  affiliation = {Stanford Medical Informatics Stanford University Stanford CA 94305 USA},
  author = {Noy, Natalya F. and Klein, Michel},
  doi = {10.1007/s10115-003-0137-2},
  interhash = {4b4ee2090ba5356a3d0e853192968662},
  intrahash = {08ee0381e240c3ee414e0eefc7fe1a83},
  issn = {0219-1377},
  journal = {Knowledge and Information Systems},
  keyword = {Computer Science},
  number = 4,
  pages = {428--440},
  publisher = {Springer},
  title = {Ontology Evolution: Not the Same as Schema Evolution},
  url = {http://dx.doi.org/10.1007/s10115-003-0137-2},
  volume = 6,
  year = 2004
}

@article{sakr2012decade,
  abstract = {Database management technology has played a vital role in facilitating key advancements of the information technology field. Database researchers—and computer scientists in general—consider prestigious conferences as their favorite and effective tools for presenting their original research and for getting good publicity. With the main aim of retaining the high quality and the prestige of these conferences, program committee members play the major role of evaluating the submitted articles and deciding which submissions are to be included in the conference programs. In this article, we study the program committees of four top-tier and prestigious database conferences (SIGMOD, VLDB, ICDE, EDBT) over a period of 10 years (2001–2010). We report on the growth in the number of program committee members in comparison to the size of the research community in the last decade. We also analyze the rate of change in the membership of the committees of the different editions of these conferences. Finally, we report on the major contributing scholars in the committees of these conferences as a means of acknowledging their impact in the community.},
  affiliation = {National ICT Australia and University of New South Wales, Sydney, Australia},
  author = {Sakr, Sherif and Alomari, Mohammad},
  doi = {10.1007/s11192-011-0530-7},
  interhash = {33b8fb28ff8af21521b43a1b5ab5c0fd},
  intrahash = {13310da323b890d3f9af4b8ef05ea9d5},
  issn = {0138-9130},
  journal = {Scientometrics},
  keyword = {Computer Science},
  number = 1,
  pages = {173--184},
  publisher = {Akadémiai Kiadó, co-published with Springer Science+Business Media B.V., Formerly Kluwer Academic Publishers B.V.},
  title = {A decade of database conferences: a look inside the program committees},
  url = {http://dx.doi.org/10.1007/s11192-011-0530-7},
  volume = 91,
  year = 2012
}

@article{dean2008mapreduce,
  abstract = {MapReduce is a programming model and an associated implementation for processing and generating large datasets that is amenable to a broad variety of real-world tasks. Users specify the computation in terms of a map and a reduce function, and the underlying runtime system automatically parallelizes the computation across large-scale clusters of machines, handles machine failures, and schedules inter-machine communication to make efficient use of the network and disks.
Programmers find the system easy to use: more than ten thousand distinct MapReduce programs have been implemented internally at Google over the past four years, and an average of one hundred thousand MapReduce jobs are executed on Google's clusters every day, processing a total of more than twenty petabytes of data per day.},
  acmid = {1327492},
  address = {New York, NY, USA},
  author = {Dean, Jeffrey and Ghemawat, Sanjay},
  doi = {10.1145/1327452.1327492},
  interhash = {b8a00982bf087c8543855897b7362a04},
  intrahash = {bff539224836d703c2d21141985fa1a3},
  issn = {0001-0782},
  issue_date = {January 2008},
  journal = {Communications of the ACM},
  month = jan,
  number = 1,
  numpages = {7},
  pages = {107--113},
  publisher = {ACM},
  title = {MapReduce: simplified data processing on large clusters},
  url = {http://doi.acm.org/10.1145/1327452.1327492},
  volume = 51,
  year = 2008
}

@inproceedings{marcus2011crowdsourced,
  abstract = {Amazon's Mechanical Turk ("MTurk") service allows users to post short tasks ("HITs") that other users can receive a small amount of money for completing. Common tasks on the system include labelling a collection of images, combining two sets of images to identify people who appear in both, or extracting sentiment from a corpus of text snippets. Designing a workflow of various kinds of HITs for filtering, aggregating, sorting, and joining data sources together is common, and comes with a set of challenges in optimizing the cost per HIT, the overall time to task completion, and the accuracy of MTurk results. We propose Qurk, a novel query system for managing these workflows, allowing crowd-powered processing of relational databases. We describe a number of query execution and optimization challenges, and discuss some potential solutions.},
  author = {Marcus, Adam and Wu, Eugene and Madden, Samuel and Miller, Robert C.},
  booktitle = {Proceedings of the 5th Biennial Conference on Innovative Data Systems Research},
  interhash = {b6b7d67c3c09259fb2d5df3f52e24c9d},
  intrahash = {29723ba38aa6039091769cd2f69a1514},
  month = jan,
  pages = {211--214},
  publisher = {CIDR},
  title = {Crowdsourced Databases: Query Processing with People},
  url = {http://dspace.mit.edu/handle/1721.1/62827},
  year = 2011
}

@inproceedings{franklin2011crowddb,
  abstract = {Some queries cannot be answered by machines only. Processing such queries requires human input for providing information that is missing from the database, for performing computationally difficult functions, and for matching, ranking, or aggregating results based on fuzzy criteria. CrowdDB uses human input via crowdsourcing to process queries that neither database systems nor search engines can adequately answer. It uses SQL both as a language for posing complex queries and as a way to model data. While CrowdDB leverages many aspects of traditional database systems, there are also important differences. Conceptually, a major change is that the traditional closed-world assumption for query processing does not hold for human input. From an implementation perspective, human-oriented query operators are needed to solicit, integrate and cleanse crowdsourced data. Furthermore, performance and cost depend on a number of new factors including worker affinity, training, fatigue, motivation and location.
We describe the design of CrowdDB, report on an initial set of experiments using Amazon Mechanical Turk, and outline important avenues for future work in the development of crowdsourced query processing systems.},
  acmid = {1989331},
  address = {New York, NY, USA},
  author = {Franklin, Michael J. and Kossmann, Donald and Kraska, Tim and Ramesh, Sukriti and Xin, Reynold},
  booktitle = {Proceedings of the 2011 International Conference on Management of Data},
  doi = {10.1145/1989323.1989331},
  interhash = {8a3f1b0fb94083c918960f1e756fe496},
  intrahash = {9525ebea13b41f27a49bafcf2f1132c6},
  isbn = {978-1-4503-0661-4},
  location = {Athens, Greece},
  numpages = {12},
  pages = {61--72},
  publisher = {ACM},
  title = {CrowdDB: answering queries with crowdsourcing},
  url = {http://doi.acm.org/10.1145/1989323.1989331},
  year = 2011
}

@inproceedings{parameswaran2011answering,
  abstract = {For some problems, human assistance is needed in addition to automated (algorithmic) computation. In sharp contrast to existing data management approaches, where human input is either ad-hoc or is never used, we describe the design of the first declarative language involving human-computable functions, standard relational operators, as well as algorithmic computation. We consider the challenges involved in optimizing queries posed in this language, in particular, the tradeoffs between uncertainty, cost and performance, as well as the combination of human and algorithmic evidence. We believe that the vision laid out in this paper can act as a road-map for a new area of data management research where human computation is routinely used in data analytics.},
  author = {Parameswaran, Aditya and Polyzotis, Neoklis},
  booktitle = {Conference on Innovative Data Systems Research (CIDR 2011)},
  interhash = {037601fdcba1c499a3e89b1427235489},
  intrahash = {8c11ab0f21767c79cd694a795eddf169},
  month = jan,
  pages = {160--166},
  title = {Answering Queries using Humans, Algorithms and Databases},
  url = {http://ilpubs.stanford.edu:8090/986/},
  year = 2011
}

@article{selke2012pushing,
  abstract = {By incorporating human workers into the query execution process, crowd-enabled databases facilitate intelligent, social capabilities like completing missing data at query time or performing cognitive operators. But despite all their flexibility, crowd-enabled databases still maintain rigid schemas. In this paper, we extend crowd-enabled databases by flexible query-driven schema expansion, allowing the addition of new attributes to the database at query time. However, the number of crowdsourced mini-tasks to fill in missing values may often be prohibitively large and the resulting data quality is doubtful. Instead of simple crowdsourcing to obtain all values individually, we leverage the user-generated data found in the Social Web: By exploiting user ratings we build perceptual spaces, i.e., highly-compressed representations of opinions, impressions, and perceptions of large numbers of users. Using few training samples obtained by expert crowdsourcing, we then can extract all missing data automatically from the perceptual space with high quality and at low cost.
Extensive experiments show that our approach can boost both performance and quality of crowd-enabled databases, while also providing the flexibility to expand schemas in a query-driven fashion.},
  acmid = {2168655},
  author = {Selke, Joachim and Lofi, Christoph and Balke, Wolf-Tilo},
  interhash = {8d2c0e1e49d00f11fa124deeea4a7dbe},
  intrahash = {41224a60badfeefb0fe2cea85f2a4ff0},
  issn = {2150-8097},
  issue_date = {February 2012},
  journal = {Proceedings of the VLDB Endowment},
  month = feb,
  number = 6,
  numpages = {12},
  pages = {538--549},
  publisher = {VLDB Endowment},
  title = {Pushing the boundaries of crowd-enabled databases with query-driven schema expansion},
  url = {http://dl.acm.org/citation.cfm?id=2168651.2168655},
  volume = 5,
  year = 2012
}

@techreport{parameswaran2011declarative,
  abstract = {Crowdsourcing enables programmers to incorporate ``human computation'' as a building block in algorithms that cannot be fully automated, such as text analysis and image recognition. Similarly, humans can be used as a building block in data-intensive applications --- providing, comparing, and verifying data used by applications. Building upon the decades-long success of declarative approaches to conventional data management, we use a similar approach for data-intensive applications that incorporate humans. Specifically, declarative queries are posed over stored relational data as well as data computed on-demand from the crowd, and the underlying system orchestrates the computation of query answers. We present Deco, a database system for declarative crowdsourcing. We describe Deco's data model, query language, and our initial prototype. Deco's data model was designed to be general (it can be instantiated to other proposed models), flexible (it allows methods for uncertainty resolution and external access to be plugged in), and principled (it has a precisely-defined semantics). Syntactically, Deco's query language is a simple extension to SQL. Based on Deco's data model, we define a precise semantics for arbitrary queries involving both stored data and data obtained from the crowd. We then describe the Deco query processor, which respects our semantics while coping with the unique combination of latency, monetary cost, and uncertainty introduced in the crowdsourcing environment. Finally, we describe our current system implementation, and we discuss the novel query optimization challenges that form the core of our ongoing work.},
  author = {Parameswaran, Aditya and Park, Hyunjung and Garcia-Molina, Hector and Polyzotis, Neoklis and Widom, Jennifer},
  institution = {Stanford University},
  interhash = {af28066d0b21d87a9ef90f63d7e6095f},
  intrahash = {4de5dd97e5466c9f1fc63c0d23b4d90a},
  number = 1015,
  publisher = {Stanford InfoLab},
  title = {Deco: Declarative Crowdsourcing},
  url = {http://ilpubs.stanford.edu:8090/1015/},
  year = 2011
}

@book{oezsu2009encyclopedia,
  address = {New York; London},
  editor = {Özsu, M.
Tamer and Liu, Ling},
  interhash = {fb7575bbb11dcbec18b1d8fc3d8ba603},
  intrahash = {bcb7a734a91759aee90015798b3a8f77},
  isbn = {9780387399409},
  publisher = {Springer},
  refid = {489216188},
  title = {Encyclopedia of Database Systems},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9780387496160},
  year = 2009
}

@book{groppe2011management,
  asin = {3642193560},
  author = {Groppe, Sven},
  dewey = {006},
  ean = {9783642193569},
  edition = {1st},
  interhash = {10c42c5bdb7216ce1485b5cc3b52b90c},
  intrahash = {e42a3f87c913af2fde57270aed9c31db},
  isbn = {3642193560},
  publisher = {Springer, Berlin},
  title = {Data Management and Query Processing in Semantic Web Databases},
  url = {http://www.amazon.de/Management-Query-Processing-Semantic-Databases/dp/3642193560},
  year = 2011
}

@inproceedings{Cooper:2010:BCS:1807128.1807152,
  abstract = {While the use of MapReduce systems (such as Hadoop) for large scale data analysis has been widely recognized and studied, we have recently seen an explosion in the number of systems developed for cloud data serving. These newer systems address "cloud OLTP" applications, though they typically do not support ACID transactions. Examples of systems proposed for cloud serving use include BigTable, PNUTS, Cassandra, HBase, Azure, CouchDB, SimpleDB, Voldemort, and many others. Further, they are being applied to a diverse range of applications that differ considerably from traditional (e.g., TPC-C like) serving workloads. The number of emerging cloud serving systems and the wide range of proposed applications, coupled with a lack of apples-to-apples performance comparisons, makes it difficult to understand the tradeoffs between systems and the workloads for which they are suited. We present the "Yahoo! Cloud Serving Benchmark" (YCSB) framework, with the goal of facilitating performance comparisons of the new generation of cloud data serving systems. We define a core set of benchmarks and report results for four widely used systems: Cassandra, HBase, Yahoo!'s PNUTS, and a simple sharded MySQL implementation. We also hope to foster the development of additional cloud benchmark suites that represent other classes of applications by making our benchmark tool available via open source. In this regard, a key feature of the YCSB framework/tool is that it is extensible--it supports easy definition of new workloads, in addition to making it easy to benchmark new systems.},
  acmid = {1807152},
  address = {New York, NY, USA},
  author = {Cooper, Brian F. and Silberstein, Adam and Tam, Erwin and Ramakrishnan, Raghu and Sears, Russell},
  booktitle = {Proceedings of the 1st ACM Symposium on Cloud Computing},
  doi = {10.1145/1807128.1807152},
  interhash = {379999e8da039d731bfb9195691c08e8},
  intrahash = {dd14b6e7abc247836d50af16e87fe5bb},
  isbn = {978-1-4503-0036-0},
  location = {Indianapolis, Indiana, USA},
  numpages = {12},
  pages = {143--154},
  publisher = {ACM},
  series = {SoCC '10},
  title = {Benchmarking cloud serving systems with YCSB},
  url = {http://doi.acm.org/10.1145/1807128.1807152},
  year = 2010
}

@book{gunter2006algorithmen,
  address = {Heidelberg},
  author = {Saake, Gunter
and Sattler, Kai-Uwe},
  interhash = {4d8fb7156345c2b744b1f683dccea05c},
  intrahash = {045074abdbde78dec786ca7743bc89d3},
  isbn = {9783898643856},
  publisher = {dpunkt.verlag},
  refid = {179970932},
  title = {Algorithmen und Datenstrukturen: eine Einführung mit Java},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9783898643856},
  year = 2006
}

@book{m1995datenbankeinsatz,
  address = {Berlin; New York},
  author = {Lang, Stefan M. and Lockemann, P. C.},
  interhash = {442a24e7ca9be3f30956c546f2e4b25e},
  intrahash = {11f80ad0e0568c05f415dad59b2c9a36},
  isbn = {9783540585589},
  publisher = {Springer-Verlag},
  refid = {33057030},
  title = {Datenbankeinsatz},
  url = {http://www.worldcat.org/search?qt=worldcat_org_all&q=9783540585589},
  year = 1995
}

@book{Sherman.2005,
  address = {Medford, NJ},
  author = {Sherman, Chris and Price, Gary},
  edition = {5th printing},
  interhash = {dc1ce7416af96d226839fb081abaea90},
  intrahash = {55f48797c52a742aa8e6a42c3a73005e},
  isbn = {0910965514},
  publisher = {Information Today},
  series = {CyberAge books},
  title = {The invisible web: Uncovering information sources search engines can't see},
  year = 2005
}

@book{Rockley.2003,
  address = {Berkeley, Calif.},
  author = {Rockley, Ann and Kostur, Pamela and Manning, Steve},
  interhash = {02e7298bef71bde4e2297617945699bb},
  intrahash = {871e44acb13d17c61674cfc9566e7b9e},
  isbn = {0735713065},
  publisher = {New Riders},
  title = {Managing enterprise content: A unified content strategy},
  year = 2003
}