@article{bechhofer2013linked, abstract = {Scientific data represents a significant portion of the linked open data cloud and scientists stand to benefit from the data fusion capability this will afford. Publishing linked data into the cloud, however, does not ensure the required reusability. Publishing has requirements of provenance, quality, credit, attribution and methods to provide the reproducibility that enables validation of results. In this paper we make the case for a scientific data publication model on top of linked data and introduce the notion of Research Objects as first class citizens for sharing and publishing.}, author = {Bechhofer, Sean and Buchan, Iain and De Roure, David and Missier, Paolo and Ainsworth, John and Bhagat, Jiten and Couch, Philip and Cruickshank, Don and Delderfield, Mark and Dunlop, Ian and Gamble, Matthew and Michaelides, Danius and Owen, Stuart and Newman, David and Sufi, Shoaib and Goble, Carole}, doi = {10.1016/j.future.2011.08.004}, interhash = {8df8b7069a622aa2eae6d74e5fdc0a6b}, intrahash = {f500b67a045765125183e23c827991d2}, issn = {0167-739X}, journal = {Future Generation Computer Systems}, number = 2, pages = {599--611}, title = {Why linked data is not enough for scientists}, url = {http://www.sciencedirect.com/science/article/pii/S0167739X11001439}, volume = 29, year = 2013 } @inproceedings{vandesompel2010httpbased, abstract = {Dereferencing a URI returns a representation of the current state of the resource identified by that URI. But, on the Web representations of prior states of a resource are also available, for example, as resource versions in Content Management Systems or archival resources in Web Archives such as the Internet Archive. This paper introduces a resource versioning mechanism that is fully based on HTTP and uses datetime as a global version indicator. The approach allows "follow your nose" style navigation both from the current time-generic resource to associated time-specific version resources as well as among version resources. The proposed versioning mechanism is congruent with the Architecture of the World Wide Web, and is based on the Memento framework that extends HTTP with transparent content negotiation in the datetime dimension. The paper shows how the versioning approach applies to Linked Data, and by means of a demonstrator built for DBpedia, it also illustrates how it can be used to conduct a time-series analysis across versions of Linked Data descriptions.}, author = {Van de Sompel, Herbert and Sanderson, Robert and Nelson, Michael L. and Balakireva, Lyudmila L. and Shankar, Harihar and Ainsworth, Scott}, booktitle = {Proceedings of Linked Data on the Web (LDOW2010)}, interhash = {0c517e7799d2c2da3f9b2a0daff27885}, intrahash = {8f9405e8056dd827d9c72a48e229a65a}, number = {1003.3661}, publisher = {arXiv}, series = {cs.DL}, title = {An HTTP-Based Versioning Mechanism for Linked Data}, url = {http://arxiv.org/abs/1003.3661}, year = 2010 } @incollection{rula2012diversity, abstract = {An increasing amount of data is published and consumed on the Web according to the Linked Data paradigm. In consideration of both publishers and consumers, the temporal dimension of data is important. In this paper we investigate the characterisation and availability of temporal information in Linked Data at large scale. Based on an abstract definition of temporal information we conduct experiments to evaluate the availability of such information using the data from the 2011 Billion Triple Challenge (BTC) dataset. 
Focusing in particular on the representation of temporal meta-information, i.e., temporal information associated with RDF statements and graphs, we investigate the approaches proposed in the literature, performing both a quantitative and a qualitative analysis and proposing guidelines for data consumers and publishers. Our experiments show that the amount of temporal information available in the LOD cloud is still very small; several different models have been used on different datasets, with a prevalence of approaches based on the annotation of RDF documents.}, address = {Berlin/Heidelberg}, author = {Rula, Anisa and Palmonari, Matteo and Harth, Andreas and Stadtmüller, Steffen and Maurino, Andrea}, booktitle = {The Semantic Web – ISWC 2012}, doi = {10.1007/978-3-642-35176-1_31}, editor = {Cudré-Mauroux, Philippe and Heflin, Jeff and Sirin, Evren and Tudorache, Tania and Euzenat, Jérôme and Hauswirth, Manfred and Parreira, Josiane Xavier and Hendler, Jim and Schreiber, Guus and Bernstein, Abraham and Blomqvist, Eva}, interhash = {ea17ab98217d3ed32b06425a83fb25ab}, intrahash = {2bf73337f9b2ca5abc5e07d1ee48cc30}, isbn = {978-3-642-35175-4}, pages = {492--507}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {On the Diversity and Availability of Temporal Information in Linked Open Data}, url = {http://dx.doi.org/10.1007/978-3-642-35176-1_31}, volume = 7649, year = 2012 } @article{bernerslee2013readwrite, abstract = {This paper discusses issues that will affect the future development of the Web, either increasing its power and utility, or alternatively suppressing its development. It argues for the importance of the continued development of the Linked Data Web, and describes the use of linked open data as an important component of that. Second, the paper defends the Web as a read–write medium, and goes on to consider how the read–write Linked Data Web could be achieved.}, author = {Berners-Lee, Tim and O’Hara, Kieron}, doi = {10.1098/rsta.2012.0513}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120513.full.pdf+html}, interhash = {d7441404d63f5e6303e1c17f0aa27a8c}, intrahash = {9ec5e708342fac1e2ea2726cb7e2acd8}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, number = 1987, title = {The read–write Linked Data Web}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120513.abstract}, volume = 371, year = 2013 } @article{karger2013standards, abstract = {The evolving Web has seen ever-growing use of structured data, thanks to the way it enhances information authoring, querying, visualization and sharing. To date, however, most structured data authoring and management tools have been oriented towards programmers and Web developers. End users have been left behind, unable to leverage structured data for information management and communication as well as professionals. In this paper, I will argue that many of the benefits of structured data management can be provided to end users as well. I will describe an approach and tools that allow end users to define their own schemas (without knowing what a schema is), manage data and author (not program) interactive Web visualizations of that data using the Web tools with which they are already familiar, such as plain Web pages, blogs, wikis and WYSIWYG document editors. 
I will describe our experience deploying these tools and some lessons relevant to their future evolution.}, author = {Karger, David}, doi = {10.1098/rsta.2012.0381}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120381.full.pdf+html}, interhash = {587a510fb2d55abda118fc8e08309e4c}, intrahash = {90d25a4bcdb5dcd12190f8823f086a02}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, month = mar, number = 1987, title = {Standards opportunities around data-bearing Web pages}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120381.abstract}, volume = 371, year = 2013 } @article{bizer2009dbpedia, abstract = {The DBpedia project is a community effort to extract structured information from Wikipedia and to make this information accessible on the Web. The resulting DBpedia knowledge base currently describes over 2.6 million entities. For each of these entities, DBpedia defines a globally unique identifier that can be dereferenced over the Web into a rich RDF description of the entity, including human-readable definitions in 30 languages, relationships to other resources, classifications in four concept hierarchies, various facts as well as data-level links to other Web data sources describing the entity. Over the last year, an increasing number of data publishers have begun to set data-level links to DBpedia resources, making DBpedia a central interlinking hub for the emerging Web of Data. Currently, the Web of interlinked data sources around DBpedia provides approximately 4.7 billion pieces of information and covers domains such as geographic information, people, companies, films, music, genes, drugs, books, and scientific publications. This article describes the extraction of the DBpedia knowledge base, the current status of interlinking DBpedia with other data sources on the Web, and gives an overview of applications that facilitate the Web of Data around DBpedia.}, author = {Bizer, Christian and Lehmann, Jens and Kobilarov, Georgi and Auer, Sören and Becker, Christian and Cyganiak, Richard and Hellmann, Sebastian}, doi = {10.1016/j.websem.2009.07.002}, interhash = {087f766f30469cbc881c83ad156a104a}, intrahash = {560097dc36a8e66b69db5cb22c1fa334}, issn = {1570-8268}, journal = {Web Semantics: Science, Services and Agents on the World Wide Web}, number = 3, pages = {154--165}, title = {DBpedia - A crystallization point for the Web of Data}, url = {http://www.sciencedirect.com/science/article/pii/S1570826809000225}, volume = 7, year = 2009 } @inproceedings{suchanek2007semantic, abstract = {We present YAGO, a light-weight and extensible ontology with high coverage and quality. YAGO builds on entities and relations and currently contains more than 1 million entities and 5 million facts. This includes the Is-A hierarchy as well as non-taxonomic relations between entities (such as HASWONPRIZE). The facts have been automatically extracted from Wikipedia and unified with WordNet, using a carefully designed combination of rule-based and heuristic methods described in this paper. The resulting knowledge base is a major step beyond WordNet: in quality by adding knowledge about individuals like persons, organizations, products, etc. with their semantic relationships - and in quantity by increasing the number of facts by more than an order of magnitude. Our empirical evaluation of fact correctness shows an accuracy of about 95%. 
YAGO is based on a logically clean model, which is decidable, extensible, and compatible with RDFS. Finally, we show how YAGO can be further extended by state-of-the-art information extraction techniques.}, acmid = {1242667}, address = {New York, NY, USA}, author = {Suchanek, Fabian M. and Kasneci, Gjergji and Weikum, Gerhard}, booktitle = {Proceedings of the 16th international conference on World Wide Web}, doi = {10.1145/1242572.1242667}, interhash = {1d2c2b23ce2a6754d12c4364e19c574c}, intrahash = {84ae693c0a6dfb6d4b051b0b6dbd3668}, isbn = {978-1-59593-654-7}, location = {Banff, Alberta, Canada}, numpages = {10}, pages = {697--706}, publisher = {ACM}, title = {YAGO: a core of semantic knowledge}, url = {http://doi.acm.org/10.1145/1242572.1242667}, year = 2007 } @incollection{auer2007dbpedia, abstract = {DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data. We describe the extraction of the DBpedia datasets, and how the resulting information is published on the Web for human- and machine-consumption. We describe some emerging applications from the DBpedia community and show how website authors can facilitate DBpedia content within their sites. Finally, we present the current status of interlinking DBpedia with other open datasets on the Web and outline how DBpedia could serve as a nucleus for an emerging Web of open data.}, address = {Berlin/Heidelberg}, author = {Auer, Sören and Bizer, Christian and Kobilarov, Georgi and Lehmann, Jens and Cyganiak, Richard and Ives, Zachary}, booktitle = {The Semantic Web}, doi = {10.1007/978-3-540-76298-0_52}, editor = {Aberer, Karl and Choi, Key-Sun and Noy, Natasha and Allemang, Dean and Lee, Kyung-Il and Nixon, Lyndon and Golbeck, Jennifer and Mika, Peter and Maynard, Diana and Mizoguchi, Riichiro and Schreiber, Guus and Cudré-Mauroux, Philippe}, interhash = {ba9f8a17de78f7864934ddb96afa67df}, intrahash = {b00f9f95ba1970164ad70aa227719c6e}, isbn = {978-3-540-76297-3}, pages = {722--735}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {DBpedia: A Nucleus for a Web of Open Data}, url = {http://dx.doi.org/10.1007/978-3-540-76298-0_52}, volume = 4825, year = 2007 } @inproceedings{pereiranunes2012entities, abstract = {The richness of the (Semantic) Web lies in its ability to link related resources as well as data across the Web. However, while relations within particular datasets are often well defined, links between disparate datasets and corpora of Web resources are rare. The increasingly widespread use of cross-domain reference datasets, such as Freebase and DBpedia for annotating and enriching datasets as well as document corpora, opens up opportunities to exploit their inherent semantics to uncover semantic relationships between disparate resources. In this paper, we present an approach to uncover relationships between disparate entities by analyzing the graphs of used reference datasets. We adapt a relationship assessment methodology from social network theory to measure the connectivity between entities in reference datasets and exploit these measures to identify correlated Web resources. Finally, we present an evaluation of our approach using the publicly available datasets Bibsonomy and USAToday. 
}, author = {Pereira Nunes, Bernardo and Kawase, Ricardo and Dietze, Stefan and Taibi, Davide and Casanova, Marco Antonio and Nejdl, Wolfgang}, booktitle = {Proceedings of the Web of Linked Entities Workshop in conjunction with the 11th International Semantic Web Conference}, editor = {Rizzo, Giuseppe and Mendes, Pablo and Charton, Eric and Hellmann, Sebastian and Kalyanpur, Aditya}, interhash = {8f969b917268449792c130dcbab06e69}, intrahash = {f22943239296ada0dfa11c30c5b4904a}, issn = {1613-0073}, month = nov, pages = {45--57}, series = {CEUR-WS.org}, title = {Can Entities be Friends?}, url = {http://ceur-ws.org/Vol-906/paper6.pdf}, urn = {urn:nbn:de:0074-906-7}, volume = 906, year = 2012 } @inproceedings{joachims2002optimizing, abstract = {This paper presents an approach to automatically optimizing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous approaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them difficult and expensive to apply. The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be recorded at very low cost. Taking a Support Vector Machine (SVM) approach, this paper presents a method for learning retrieval functions. From a theoretical perspective, this method is shown to be well-founded in a risk minimization framework. Furthermore, it is shown to be feasible even for large sets of queries and features. The theoretical results are verified in a controlled experiment. It shows that the method can effectively adapt the retrieval function of a meta-search engine to a particular group of users, outperforming Google in terms of retrieval quality after only a couple of hundred training examples.}, acmid = {775067}, address = {New York, NY, USA}, author = {Joachims, Thorsten}, booktitle = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, doi = {10.1145/775047.775067}, interhash = {c78df69370bbf12636eaa5233b1fba83}, intrahash = {656a83f1057c5792506d0d656ae81d26}, isbn = {1-58113-567-X}, location = {Edmonton, Alberta, Canada}, numpages = {10}, pages = {133--142}, publisher = {ACM}, title = {Optimizing search engines using clickthrough data}, url = {http://doi.acm.org/10.1145/775047.775067}, year = 2002 } @inproceedings{joachims2005accurately, abstract = {This paper examines the reliability of implicit feedback generated from clickthrough data in WWW search. Analyzing the users' decision process using eyetracking and comparing implicit feedback against manual relevance judgments, we conclude that clicks are informative but biased. 
While this makes the interpretation of clicks as absolute relevance judgments difficult, we show that relative preferences derived from clicks are reasonably accurate on average.}, acmid = {1076063}, address = {New York, NY, USA}, author = {Joachims, Thorsten and Granka, Laura and Pan, Bing and Hembrooke, Helene and Gay, Geri}, booktitle = {Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval}, doi = {10.1145/1076034.1076063}, interhash = {050982b76855a6b1258ed0b40cb69018}, intrahash = {8c488477626fa59db419ac77f3552029}, isbn = {1-59593-034-5}, location = {Salvador, Brazil}, numpages = {8}, pages = {154--161}, publisher = {ACM}, title = {Accurately interpreting clickthrough data as implicit feedback}, url = {http://doi.acm.org/10.1145/1076034.1076063}, year = 2005 } @inproceedings{martins2008extracting, abstract = {Geo-temporal criteria are important for filtering, grouping and prioritizing information resources. This paper presents techniques for extracting semantic geo-temporal information from text, using simple text mining methods that leverage on a gazetteer. A prototype system, implementing the proposed methods and capable of displaying information over maps and timelines, is described. This prototype can take input in RSS, demonstrating the application to content from many different online sources. Experimental results demonstrate the efficiency and accuracy of the proposed approaches.}, author = {Martins, B. and Manguinhas, H. and Borbinha, J.}, booktitle = {Proceedings of the International Conference on Semantic Computing}, doi = {10.1109/ICSC.2008.86}, interhash = {d03fecb6b3261ffa0a5e11789b188883}, intrahash = {5a889bc7d9e81cb1d294cb83b767bf64}, month = aug, pages = {1--9}, publisher = {IEEE Computer Society}, title = {Extracting and Exploring the Geo-Temporal Semantics of Textual Resources}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4597167}, year = 2008 } @article{goodwin2008geographical, abstract = {Ordnance Survey, the national mapping agency of Great Britain, is investigating how semantic web technologies assist its role as a geographical information provider. A major part of this work involves the development of prototype products and datasets in RDF. This article discusses the production of an example dataset for the administrative geography of Great Britain, demonstrating the advantages of explicitly encoding topological relations between geographic entities over traditional spatial queries. We also outline how these data can be linked to other datasets on the web of linked data and some of the challenges that this raises.}, author = {Goodwin, John and Dolbear, Catherine and Hart, Glen}, doi = {10.1111/j.1467-9671.2008.01133.x}, interhash = {ea248d549690eceb8e7aa06ccb24e226}, intrahash = {08412bb4afca1e86d0cca0a8a083f2a2}, issn = {1467-9671}, journal = {Transactions in GIS}, pages = {19--30}, publisher = {Blackwell Publishing Ltd}, title = {Geographical Linked Data: The Administrative Geography of Great Britain on the Semantic Web}, url = {http://dx.doi.org/10.1111/j.1467-9671.2008.01133.x}, volume = 12, year = 2008 } @article{liu2012crowdsourcing, abstract = {Some complex problems, such as image tagging and natural language processing, are very challenging for computers, where even state-of-the-art technology is not yet able to provide satisfactory accuracy. 
Therefore, rather than relying solely on developing new and better algorithms to handle such tasks, we look to the crowdsourcing solution -- employing human participation -- to make good the shortfall in current technology. Crowdsourcing is a good supplement to many computer tasks. A complex job may be divided into computer-oriented tasks and human-oriented tasks, which are then assigned to machines and humans respectively.

To leverage the power of crowdsourcing, we design and implement a Crowdsourcing Data Analytics System, CDAS. CDAS is a framework designed to support the deployment of various crowdsourcing applications. The core part of CDAS is a quality-sensitive answering model, which guides the crowdsourcing engine to process and monitor the human tasks. In this paper, we introduce the principles of our quality-sensitive model. To satisfy user required accuracy, the model guides the crowdsourcing query engine for the design and processing of the corresponding crowdsourcing jobs. It provides an estimated accuracy for each generated result based on the human workers' historical performances. When verifying the quality of the result, the model employs an online strategy to reduce waiting time. To show the effectiveness of the model, we implement and deploy two analytics jobs on CDAS, a twitter sentiment analytics job and an image tagging job. We use real Twitter and Flickr data as our queries respectively. We compare our approaches with state-of-the-art classification and image annotation techniques. The results show that the human-assisted methods can indeed achieve a much higher accuracy. By embedding the quality-sensitive model into crowdsourcing query engine, we effectively reduce the processing cost while maintaining the required query answer quality.}, acmid = {2336676}, author = {Liu, Xuan and Lu, Meiyu and Ooi, Beng Chin and Shen, Yanyan and Wu, Sai and Zhang, Meihui}, interhash = {41ad6e73b03373d76d3164ba248335d7}, intrahash = {2091967734f96c4afbc09319d48a8c65}, issn = {2150-8097}, issue_date = {June 2012}, journal = {Proceedings of the VLDB Endowment}, month = jun, number = 10, numpages = {12}, pages = {1040--1051}, publisher = {VLDB Endowment}, title = {CDAS: a crowdsourcing data analytics system}, url = {http://dl.acm.org/citation.cfm?id=2336664.2336676}, volume = 5, year = 2012 } @article{muniswamyreddy2010provenance, abstract = {Digital provenance is meta-data that describes the ancestry or history of a digital object. Most work on provenance focuses on how provenance increases the value of data to consumers. However, provenance is also valuable to storage providers. For example, provenance can provide hints on access patterns, detect anomalous behavior, and provide enhanced user search capabilities. As the next generation storage providers, cloud vendors are in the unique position to capitalize on this opportunity to incorporate provenance as a fundamental storage system primitive. To date, cloud offerings have not yet done so. We provide motivation for providers to treat provenance as first class data in the cloud and based on our experience with provenance in a local storage system, suggest a set of requirements that make provenance feasible and attractive.}, acmid = {1713258}, address = {New York, NY, USA}, author = {Muniswamy-Reddy, Kiran-Kumar and Seltzer, Margo}, doi = {10.1145/1713254.1713258}, interhash = {6fb5af3426b91f7e460d99746b3358c2}, intrahash = {1f9f6761cab2437739d30b9636ba5531}, issn = {0163-5980}, issue_date = {January 2010}, journal = {SIGOPS Operating Systems Review}, month = jan, number = 4, numpages = {6}, pages = {11--16}, publisher = {ACM}, title = {Provenance as first class cloud data}, url = {http://doi.acm.org/10.1145/1713254.1713258}, volume = 43, year = 2010 } @article{alsubaiee2012asterix, abstract = {At UC Irvine, we are building a next generation parallel database system, called ASTERIX, as our approach to addressing today's "Big Data" management challenges. 
ASTERIX aims to combine time-tested principles from parallel database systems with those of the Web-scale computing community, such as fault tolerance for long running jobs. In this demo, we present a whirlwind tour of ASTERIX, highlighting a few of its key features. We will demonstrate examples of our data definition language to model semi-structured data, and examples of interesting queries using our declarative query language. In particular, we will show the capabilities of ASTERIX for answering geo-spatial queries and fuzzy queries, as well as ASTERIX' data feed construct for continuously ingesting data.}, acmid = {2367532}, author = {Alsubaiee, Sattam and Altowim, Yasser and Altwaijry, Hotham and Behm, Alexander and Borkar, Vinayak and Bu, Yingyi and Carey, Michael and Grover, Raman and Heilbron, Zachary and Kim, Young-Seok and Li, Chen and Onose, Nicola and Pirzadeh, Pouria and Vernica, Rares and Wen, Jian}, interhash = {ae521b66302adb1b7df3f4cdb8d92181}, intrahash = {003f2654ae41861cfb77bf0353634ac3}, issn = {2150-8097}, issue_date = {August 2012}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 12, numpages = {4}, pages = {1898--1901}, publisher = {VLDB Endowment}, title = {ASTERIX: an open source system for "Big Data" management and analysis (demo)}, url = {http://dl.acm.org/citation.cfm?id=2367502.2367532}, volume = 5, year = 2012 } @article{behm2011asterix, abstract = {ASTERIX is a new data-intensive storage and computing platform project spanning UC Irvine, UC Riverside, and UC San Diego. In this paper we provide an overview of the ASTERIX project, starting with its main goal—the storage and analysis of data pertaining to evolving-world models . We describe the requirements and associated challenges, and explain how the project is addressing them. We provide a technical overview of ASTERIX, covering its architecture, its user model for data and queries, and its approach to scalable query processing and data management. ASTERIX utilizes a new scalable runtime computational platform called Hyracks that is also discussed at an overview level; we have recently made Hyracks available in open source for use by other interested parties. We also relate our work on ASTERIX to the current state of the art and describe the research challenges that we are currently tackling as well as those that lie ahead.}, address = {Netherlands}, affiliation = {University of California, Irvine, USA}, author = {Behm, Alexander and Borkar, Vinayak and Carey, Michael and Grover, Raman and Li, Chen and Onose, Nicola and Vernica, Rares and Deutsch, Alin and Papakonstantinou, Yannis and Tsotras, Vassilis}, doi = {10.1007/s10619-011-7082-y}, interhash = {3e06363406f716c5d9340dc2c693adb3}, intrahash = {42d96cc4877943527a9259424c584740}, issn = {0926-8782}, journal = {Distributed and Parallel Databases}, keyword = {Computer Science}, number = 3, pages = {185--216}, publisher = {Springer}, title = {ASTERIX: towards a scalable, semistructured data platform for evolving-world models}, url = {http://dx.doi.org/10.1007/s10619-011-7082-y}, volume = 29, year = 2011 } @inproceedings{abiteboul1998incremental, abstract = {Semistructured data is not strictly typed like relational or object-oriented data and may be irregular or incomplete. It often arises in practice, e.g., when heterogeneous data sources are integrated or data is taken from the World Wide Web. Views over semistructured data can be used to filter the data and to restructure (or provide structure to) it. 
To achieve fast query response time, these views are often materialized. This paper studies incremental maintenance techniques for materialized views over semistructured data. We use the graph-based data model OEM and the query language Lorel, developed at Stanford, as the framework for our work. We propose a new algorithm that produces a set of queries that compute the changes to the view based upon a change to the source. We develop an analytic cost model and compare the cost of executing our incremental maintenance algorithm to that of recomputing the view. We show that for nearly all types of database updates, it is more efficient to apply our incremental maintenance algorithm to the view than to recompute the view from the database, even when there are thousands of such updates.}, author = {Abiteboul, S. and McHugh, J. and Rys, M. and Vassalos, V. and Wiener, J.}, booktitle = {24th International Conference on Very Large Data Bases}, interhash = {b395f09383de5eb21d34ad8c2b39ab59}, intrahash = {32903b757b4b4d118c77f4aeac4b0d94}, month = aug, pages = {38--49}, publisher = {Morgan Kaufmann}, title = {Incremental Maintenance for Materialized Views over Semistructured Data}, url = {http://ilpubs.stanford.edu:8090/340/}, year = 1998 } @article{dean2008mapreduce, abstract = {MapReduce is a programming model and an associated implementation for processing and generating large datasets that is amenable to a broad variety of real-world tasks. Users specify the computation in terms of a map and a reduce function, and the underlying runtime system automatically parallelizes the computation across large-scale clusters of machines, handles machine failures, and schedules inter-machine communication to make efficient use of the network and disks. Programmers find the system easy to use: more than ten thousand distinct MapReduce programs have been implemented internally at Google over the past four years, and an average of one hundred thousand MapReduce jobs are executed on Google's clusters every day, processing a total of more than twenty petabytes of data per day.}, acmid = {1327492}, address = {New York, NY, USA}, author = {Dean, Jeffrey and Ghemawat, Sanjay}, doi = {10.1145/1327452.1327492}, interhash = {b8a00982bf087c8543855897b7362a04}, intrahash = {bff539224836d703c2d21141985fa1a3}, issn = {0001-0782}, issue_date = {January 2008}, journal = {Communications of the ACM}, month = jan, number = 1, numpages = {7}, pages = {107--113}, publisher = {ACM}, title = {MapReduce: simplified data processing on large clusters}, url = {http://doi.acm.org/10.1145/1327452.1327492}, volume = 51, year = 2008 } @article{clauset2009powerlaw, abstract = {Power-law distributions occur in many situations of scientific interest and have significant consequences for our understanding of natural and man-made phenomena. Unfortunately, the detection and characterization of power laws is complicated by the large fluctuations that occur in the tail of the distribution—the part of the distribution representing large but rare events—and by the difficulty of identifying the range over which power-law behavior holds. Commonly used methods for analyzing power-law data, such as least-squares fitting, can produce substantially inaccurate estimates of parameters for power-law distributions, and even in cases where such methods return accurate answers they are still unsatisfactory because they give no indication of whether the data obey a power law at all. 
Here we present a principled statistical framework for discerning and quantifying power-law behavior in empirical data. Our approach combines maximum-likelihood fitting methods with goodness-of-fit tests based on the Kolmogorov–Smirnov (KS) statistic and likelihood ratios. We evaluate the effectiveness of the approach with tests on synthetic data and give critical comparisons to previous approaches. We also apply the proposed methods to twenty-four real-world data sets from a range of different disciplines, each of which has been conjectured to follow a power-law distribution. In some cases we find these conjectures to be consistent with the data, while in others the power law is ruled out.}, author = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.}, doi = {10.1137/070710111}, interhash = {9ce8658af5a6358a758bfdb819f73394}, intrahash = {c0097d202655474b1db6811ddea03410}, issn = {0036-1445}, journal = {SIAM Review}, number = 4, pages = {661--703}, publisher = {SIAM}, title = {Power-Law Distributions in Empirical Data}, url = {http://link.aip.org/link/?SIR/51/661/1}, volume = 51, year = 2009 }