@article{bechhofer2013linked, abstract = {Scientific data represents a significant portion of the linked open data cloud and scientists stand to benefit from the data fusion capability this will afford. Publishing linked data into the cloud, however, does not ensure the required reusability. Publishing has requirements of provenance, quality, credit, attribution and methods to provide the reproducibility that enables validation of results. In this paper we make the case for a scientific data publication model on top of linked data and introduce the notion of Research Objects as first class citizens for sharing and publishing.}, author = {Bechhofer, Sean and Buchan, Iain and De Roure, David and Missier, Paolo and Ainsworth, John and Bhagat, Jiten and Couch, Philip and Cruickshank, Don and Delderfield, Mark and Dunlop, Ian and Gamble, Matthew and Michaelides, Danius and Owen, Stuart and Newman, David and Sufi, Shoaib and Goble, Carole}, doi = {10.1016/j.future.2011.08.004}, interhash = {8df8b7069a622aa2eae6d74e5fdc0a6b}, intrahash = {f500b67a045765125183e23c827991d2}, issn = {0167-739X}, journal = {Future Generation Computer Systems}, number = 2, pages = {599--611}, title = {Why linked data is not enough for scientists}, url = {http://www.sciencedirect.com/science/article/pii/S0167739X11001439}, volume = 29, year = 2013 } @inproceedings{vandesompel2010httpbased, abstract = {Dereferencing a URI returns a representation of the current state of the resource identified by that URI. But, on the Web representations of prior states of a resource are also available, for example, as resource versions in Content Management Systems or archival resources in Web Archives such as the Internet Archive. This paper introduces a resource versioning mechanism that is fully based on HTTP and uses datetime as a global version indicator. The approach allows "follow your nose" style navigation both from the current time-generic resource to associated time-specific version resources as well as among version resources. The proposed versioning mechanism is congruent with the Architecture of the World Wide Web, and is based on the Memento framework that extends HTTP with transparent content negotiation in the datetime dimension. The paper shows how the versioning approach applies to Linked Data, and by means of a demonstrator built for DBpedia, it also illustrates how it can be used to conduct a time-series analysis across versions of Linked Data descriptions.}, author = {Van de Sompel, Herbert and Sanderson, Robert and Nelson, Michael L. and Balakireva, Lyudmila L. and Shankar, Harihar and Ainsworth, Scott}, booktitle = {Proceedings of Linked Data on the Web (LDOW2010)}, interhash = {0c517e7799d2c2da3f9b2a0daff27885}, intrahash = {8f9405e8056dd827d9c72a48e229a65a}, number = {1003.3661}, publisher = {arXiv}, series = {cs.DL}, title = {An HTTP-Based Versioning Mechanism for Linked Data}, url = {http://arxiv.org/abs/1003.3661}, year = 2010 } @incollection{rula2012diversity, abstract = {An increasing amount of data is published and consumed on the Web according to the Linked Data paradigm. In consideration of both publishers and consumers, the temporal dimension of data is important. In this paper we investigate the characterisation and availability of temporal information in Linked Data at large scale. Based on an abstract definition of temporal information we conduct experiments to evaluate the availability of such information using the data from the 2011 Billion Triple Challenge (BTC) dataset. 
Focusing in particular on the representation of temporal meta-information, i.e., temporal information associated with RDF statements and graphs, we investigate the approaches proposed in the literature, performing both a quantitative and a qualitative analysis and proposing guidelines for data consumers and publishers. Our experiments show that the amount of temporal information available in the LOD cloud is still very small; several different models have been used on different datasets, with a prevalence of approaches based on the annotation of RDF documents.}, address = {Berlin/Heidelberg}, author = {Rula, Anisa and Palmonari, Matteo and Harth, Andreas and Stadtmüller, Steffen and Maurino, Andrea}, booktitle = {The Semantic Web – ISWC 2012}, doi = {10.1007/978-3-642-35176-1_31}, editor = {Cudré-Mauroux, Philippe and Heflin, Jeff and Sirin, Evren and Tudorache, Tania and Euzenat, Jérôme and Hauswirth, Manfred and Parreira, Josiane Xavier and Hendler, Jim and Schreiber, Guus and Bernstein, Abraham and Blomqvist, Eva}, interhash = {ea17ab98217d3ed32b06425a83fb25ab}, intrahash = {2bf73337f9b2ca5abc5e07d1ee48cc30}, isbn = {978-3-642-35175-4}, pages = {492--507}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {On the Diversity and Availability of Temporal Information in Linked Open Data}, url = {http://dx.doi.org/10.1007/978-3-642-35176-1_31}, volume = 7649, year = 2012 } @article{bernerslee2013readwrite, abstract = {This paper discusses issues that will affect the future development of the Web, either increasing its power and utility, or alternatively suppressing its development. It argues for the importance of the continued development of the Linked Data Web, and describes the use of linked open data as an important component of that. Second, the paper defends the Web as a read–write medium, and goes on to consider how the read–write Linked Data Web could be achieved.}, author = {Berners-Lee, Tim and O’Hara, Kieron}, doi = {10.1098/rsta.2012.0513}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120513.full.pdf+html}, interhash = {d7441404d63f5e6303e1c17f0aa27a8c}, intrahash = {9ec5e708342fac1e2ea2726cb7e2acd8}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, number = 1987, title = {The read–write Linked Data Web}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120513.abstract}, volume = 371, year = 2013 } @article{karger2013standards, abstract = {The evolving Web has seen ever-growing use of structured data, thanks to the way it enhances information authoring, querying, visualization and sharing. To date, however, most structured data authoring and management tools have been oriented towards programmers and Web developers. End users have been left behind, unable to leverage structured data for information management and communication as well as professionals. In this paper, I will argue that many of the benefits of structured data management can be provided to end users as well. I will describe an approach and tools that allow end users to define their own schemas (without knowing what a schema is), manage data and author (not program) interactive Web visualizations of that data using the Web tools with which they are already familiar, such as plain Web pages, blogs, wikis and WYSIWYG document editors.
I will describe our experience deploying these tools and some lessons relevant to their future evolution.}, author = {Karger, David}, doi = {10.1098/rsta.2012.0381}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120381.full.pdf+html}, interhash = {587a510fb2d55abda118fc8e08309e4c}, intrahash = {90d25a4bcdb5dcd12190f8823f086a02}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, month = mar, number = 1987, title = {Standards opportunities around data-bearing Web pages}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120381.abstract}, volume = 371, year = 2013 } @article{bizer2009dbpedia, abstract = {The DBpedia project is a community effort to extract structured information from Wikipedia and to make this information accessible on the Web. The resulting DBpedia knowledge base currently describes over 2.6 million entities. For each of these entities, DBpedia defines a globally unique identifier that can be dereferenced over the Web into a rich RDF description of the entity, including human-readable definitions in 30 languages, relationships to other resources, classifications in four concept hierarchies, various facts as well as data-level links to other Web data sources describing the entity. Over the last year, an increasing number of data publishers have begun to set data-level links to DBpedia resources, making DBpedia a central interlinking hub for the emerging Web of Data. Currently, the Web of interlinked data sources around DBpedia provides approximately 4.7 billion pieces of information and covers domains such as geographic information, people, companies, films, music, genes, drugs, books, and scientific publications. This article describes the extraction of the DBpedia knowledge base, the current status of interlinking DBpedia with other data sources on the Web, and gives an overview of applications that facilitate the Web of Data around DBpedia.}, author = {Bizer, Christian and Lehmann, Jens and Kobilarov, Georgi and Auer, Sören and Becker, Christian and Cyganiak, Richard and Hellmann, Sebastian}, doi = {10.1016/j.websem.2009.07.002}, interhash = {087f766f30469cbc881c83ad156a104a}, intrahash = {560097dc36a8e66b69db5cb22c1fa334}, issn = {1570-8268}, journal = {Web Semantics: Science, Services and Agents on the World Wide Web}, number = 3, pages = {154--165}, title = {DBpedia - A crystallization point for the Web of Data}, url = {http://www.sciencedirect.com/science/article/pii/S1570826809000225}, volume = 7, year = 2009 } @inproceedings{suchanek2007semantic, abstract = {We present YAGO, a light-weight and extensible ontology with high coverage and quality. YAGO builds on entities and relations and currently contains more than 1 million entities and 5 million facts. This includes the Is-A hierarchy as well as non-taxonomic relations between entities (such as HASWONPRIZE). The facts have been automatically extracted from Wikipedia and unified with WordNet, using a carefully designed combination of rule-based and heuristic methods described in this paper. The resulting knowledge base is a major step beyond WordNet: in quality by adding knowledge about individuals like persons, organizations, products, etc. with their semantic relationships - and in quantity by increasing the number of facts by more than an order of magnitude. Our empirical evaluation of fact correctness shows an accuracy of about 95%.
YAGO is based on a logically clean model, which is decidable, extensible, and compatible with RDFS. Finally, we show how YAGO can be further extended by state-of-the-art information extraction techniques.}, acmid = {1242667}, address = {New York, NY, USA}, author = {Suchanek, Fabian M. and Kasneci, Gjergji and Weikum, Gerhard}, booktitle = {Proceedings of the 16th international conference on World Wide Web}, doi = {10.1145/1242572.1242667}, interhash = {1d2c2b23ce2a6754d12c4364e19c574c}, intrahash = {84ae693c0a6dfb6d4b051b0b6dbd3668}, isbn = {978-1-59593-654-7}, location = {Banff, Alberta, Canada}, numpages = {10}, pages = {697--706}, publisher = {ACM}, title = {YAGO: a core of semantic knowledge}, url = {http://doi.acm.org/10.1145/1242572.1242667}, year = 2007 } @incollection{auer2007dbpedia, abstract = {DBpedia is a community effort to extract structured information from Wikipedia and to make this information available on the Web. DBpedia allows you to ask sophisticated queries against datasets derived from Wikipedia and to link other datasets on the Web to Wikipedia data. We describe the extraction of the DBpedia datasets, and how the resulting information is published on the Web for human- and machine-consumption. We describe some emerging applications from the DBpedia community and show how website authors can facilitate DBpedia content within their sites. Finally, we present the current status of interlinking DBpedia with other open datasets on the Web and outline how DBpedia could serve as a nucleus for an emerging Web of open data.}, address = {Berlin/Heidelberg}, author = {Auer, Sören and Bizer, Christian and Kobilarov, Georgi and Lehmann, Jens and Cyganiak, Richard and Ives, Zachary}, booktitle = {The Semantic Web}, doi = {10.1007/978-3-540-76298-0_52}, editor = {Aberer, Karl and Choi, Key-Sun and Noy, Natasha and Allemang, Dean and Lee, Kyung-Il and Nixon, Lyndon and Golbeck, Jennifer and Mika, Peter and Maynard, Diana and Mizoguchi, Riichiro and Schreiber, Guus and Cudré-Mauroux, Philippe}, interhash = {ba9f8a17de78f7864934ddb96afa67df}, intrahash = {b00f9f95ba1970164ad70aa227719c6e}, isbn = {978-3-540-76297-3}, pages = {722--735}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {DBpedia: A Nucleus for a Web of Open Data}, url = {http://dx.doi.org/10.1007/978-3-540-76298-0_52}, volume = 4825, year = 2007 } @inproceedings{pereiranunes2012entities, abstract = {The richness of the (Semantic) Web lies in its ability to link related resources as well as data across the Web. However, while relations within particular datasets are often well defined, links between disparate datasets and corpora of Web resources are rare. The increasingly widespread use of cross-domain reference datasets, such as Freebase and DBpedia for annotating and enriching datasets as well as document corpora, opens up opportunities to exploit their inherent semantics to uncover semantic relationships between disparate resources. In this paper, we present an approach to uncover relationships between disparate entities by analyzing the graphs of used reference datasets. We adapt a relationship assessment methodology from social network theory to measure the connectivity between entities in reference datasets and exploit these measures to identify correlated Web resources. Finally, we present an evaluation of our approach using the publicly available datasets Bibsonomy and USAToday. 
}, author = {Pereira Nunes, Bernardo and Kawase, Ricardo and Dietze, Stefan and Taibi, Davide and Casanova, Marco Antonio and Nejdl, Wolfgang}, booktitle = {Proceedings of the Web of Linked Entities Workshop in conjunction with the 11th International Semantic Web Conference}, editor = {Rizzo, Giuseppe and Mendes, Pablo and Charton, Eric and Hellmann, Sebastian and Kalyanpur, Aditya}, interhash = {8f969b917268449792c130dcbab06e69}, intrahash = {f22943239296ada0dfa11c30c5b4904a}, issn = {1613-0073}, month = nov, pages = {45--57}, series = {CEUR-WS.org}, title = {Can Entities be Friends?}, url = {http://ceur-ws.org/Vol-906/paper6.pdf}, urn = {urn:nbn:de:0074-906-7}, volume = 906, year = 2012 } @inproceedings{joachims2002optimizing, abstract = {This paper presents an approach to automatically optimizing the retrieval quality of search engines using clickthrough data. Intuitively, a good information retrieval system should present relevant documents high in the ranking, with less relevant documents following below. While previous approaches to learning retrieval functions from examples exist, they typically require training data generated from relevance judgments by experts. This makes them difficult and expensive to apply. The goal of this paper is to develop a method that utilizes clickthrough data for training, namely the query-log of the search engine in connection with the log of links the users clicked on in the presented ranking. Such clickthrough data is available in abundance and can be recorded at very low cost. Taking a Support Vector Machine (SVM) approach, this paper presents a method for learning retrieval functions. From a theoretical perspective, this method is shown to be well-founded in a risk minimization framework. Furthermore, it is shown to be feasible even for large sets of queries and features. The theoretical results are verified in a controlled experiment. It shows that the method can effectively adapt the retrieval function of a meta-search engine to a particular group of users, outperforming Google in terms of retrieval quality after only a couple of hundred training examples.}, acmid = {775067}, address = {New York, NY, USA}, author = {Joachims, Thorsten}, booktitle = {Proceedings of the eighth ACM SIGKDD international conference on Knowledge discovery and data mining}, doi = {10.1145/775047.775067}, interhash = {c78df69370bbf12636eaa5233b1fba83}, intrahash = {656a83f1057c5792506d0d656ae81d26}, isbn = {1-58113-567-X}, location = {Edmonton, Alberta, Canada}, numpages = {10}, pages = {133--142}, publisher = {ACM}, title = {Optimizing search engines using clickthrough data}, url = {http://doi.acm.org/10.1145/775047.775067}, year = 2002 } @inproceedings{joachims2005accurately, abstract = {This paper examines the reliability of implicit feedback generated from clickthrough data in WWW search. Analyzing the users' decision process using eyetracking and comparing implicit feedback against manual relevance judgments, we conclude that clicks are informative but biased.
While this makes the interpretation of clicks as absolute relevance judgments difficult, we show that relative preferences derived from clicks are reasonably accurate on average.}, acmid = {1076063}, address = {New York, NY, USA}, author = {Joachims, Thorsten and Granka, Laura and Pan, Bing and Hembrooke, Helene and Gay, Geri}, booktitle = {Proceedings of the 28th annual international ACM SIGIR conference on Research and development in information retrieval}, doi = {10.1145/1076034.1076063}, interhash = {050982b76855a6b1258ed0b40cb69018}, intrahash = {8c488477626fa59db419ac77f3552029}, isbn = {1-59593-034-5}, location = {Salvador, Brazil}, numpages = {8}, pages = {154--161}, publisher = {ACM}, title = {Accurately interpreting clickthrough data as implicit feedback}, url = {http://doi.acm.org/10.1145/1076034.1076063}, year = 2005 } @inproceedings{martins2008extracting, abstract = {Geo-temporal criteria are important for filtering, grouping and prioritizing information resources. This paper presents techniques for extracting semantic geo-temporal information from text, using simple text mining methods that leverage on a gazetteer. A prototype system, implementing the proposed methods and capable of displaying information over maps and timelines, is described. This prototype can take input in RSS, demonstrating the application to content from many different online sources. Experimental results demonstrate the efficiency and accuracy of the proposed approaches.}, author = {Martins, B. and Manguinhas, H. and Borbinha, J.}, booktitle = {Proceedings of the International Conference on Semantic Computing}, doi = {10.1109/ICSC.2008.86}, interhash = {d03fecb6b3261ffa0a5e11789b188883}, intrahash = {5a889bc7d9e81cb1d294cb83b767bf64}, month = aug, pages = {1--9}, publisher = {IEEE Computer Society}, title = {Extracting and Exploring the Geo-Temporal Semantics of Textual Resources}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4597167}, year = 2008 } @article{goodwin2008geographical, abstract = {Ordnance Survey, the national mapping agency of Great Britain, is investigating how semantic web technologies assist its role as a geographical information provider. A major part of this work involves the development of prototype products and datasets in RDF. This article discusses the production of an example dataset for the administrative geography of Great Britain, demonstrating the advantages of explicitly encoding topological relations between geographic entities over traditional spatial queries. We also outline how these data can be linked to other datasets on the web of linked data and some of the challenges that this raises.}, author = {Goodwin, John and Dolbear, Catherine and Hart, Glen}, doi = {10.1111/j.1467-9671.2008.01133.x}, interhash = {ea248d549690eceb8e7aa06ccb24e226}, intrahash = {08412bb4afca1e86d0cca0a8a083f2a2}, issn = {1467-9671}, journal = {Transactions in GIS}, pages = {19--30}, publisher = {Blackwell Publishing Ltd}, title = {Geographical Linked Data: The Administrative Geography of Great Britain on the Semantic Web}, url = {http://dx.doi.org/10.1111/j.1467-9671.2008.01133.x}, volume = 12, year = 2008 } @article{liu2012crowdsourcing, abstract = {Some complex problems, such as image tagging and natural language processing, are very challenging for computers, where even state-of-the-art technology is not yet able to provide satisfactory accuracy.
Therefore, rather than relying solely on developing new and better algorithms to handle such tasks, we look to the crowdsourcing solution -- employing human participation -- to make good the shortfall in current technology. Crowdsourcing is a good supplement to many computer tasks. A complex job may be divided into computer-oriented tasks and human-oriented tasks, which are then assigned to machines and humans respectively.

To leverage the power of crowdsourcing, we design and implement a Crowdsourcing Data Analytics System, CDAS. CDAS is a framework designed to support the deployment of various crowdsourcing applications. The core part of CDAS is a quality-sensitive answering model, which guides the crowdsourcing engine to process and monitor the human tasks. In this paper, we introduce the principles of our quality-sensitive model. To satisfy user required accuracy, the model guides the crowdsourcing query engine for the design and processing of the corresponding crowdsourcing jobs. It provides an estimated accuracy for each generated result based on the human workers' historical performances. When verifying the quality of the result, the model employs an online strategy to reduce waiting time. To show the effectiveness of the model, we implement and deploy two analytics jobs on CDAS, a twitter sentiment analytics job and an image tagging job. We use real Twitter and Flickr data as our queries respectively. We compare our approaches with state-of-the-art classification and image annotation techniques. The results show that the human-assisted methods can indeed achieve a much higher accuracy. By embedding the quality-sensitive model into crowdsourcing query engine, we effectively reduce the processing cost while maintaining the required query answer quality.}, acmid = {2336676}, author = {Liu, Xuan and Lu, Meiyu and Ooi, Beng Chin and Shen, Yanyan and Wu, Sai and Zhang, Meihui}, interhash = {41ad6e73b03373d76d3164ba248335d7}, intrahash = {2091967734f96c4afbc09319d48a8c65}, issn = {2150-8097}, issue_date = {June 2012}, journal = {Proceedings of the VLDB Endowment}, month = jun, number = 10, numpages = {12}, pages = {1040--1051}, publisher = {VLDB Endowment}, title = {CDAS: a crowdsourcing data analytics system}, url = {http://dl.acm.org/citation.cfm?id=2336664.2336676}, volume = 5, year = 2012 } @article{muniswamyreddy2010provenance, abstract = {Digital provenance is meta-data that describes the ancestry or history of a digital object. Most work on provenance focuses on how provenance increases the value of data to consumers. However, provenance is also valuable to storage providers. For example, provenance can provide hints on access patterns, detect anomalous behavior, and provide enhanced user search capabilities. As the next generation storage providers, cloud vendors are in the unique position to capitalize on this opportunity to incorporate provenance as a fundamental storage system primitive. To date, cloud offerings have not yet done so. We provide motivation for providers to treat provenance as first class data in the cloud and based on our experience with provenance in a local storage system, suggest a set of requirements that make provenance feasible and attractive.}, acmid = {1713258}, address = {New York, NY, USA}, author = {Muniswamy-Reddy, Kiran-Kumar and Seltzer, Margo}, doi = {10.1145/1713254.1713258}, interhash = {6fb5af3426b91f7e460d99746b3358c2}, intrahash = {1f9f6761cab2437739d30b9636ba5531}, issn = {0163-5980}, issue_date = {January 2010}, journal = {SIGOPS Operating Systems Review}, month = jan, number = 4, numpages = {6}, pages = {11--16}, publisher = {ACM}, title = {Provenance as first class cloud data}, url = {http://doi.acm.org/10.1145/1713254.1713258}, volume = 43, year = 2010 } @article{alsubaiee2012asterix, abstract = {At UC Irvine, we are building a next generation parallel database system, called ASTERIX, as our approach to addressing today's "Big Data" management challenges. 
ASTERIX aims to combine time-tested principles from parallel database systems with those of the Web-scale computing community, such as fault tolerance for long running jobs. In this demo, we present a whirlwind tour of ASTERIX, highlighting a few of its key features. We will demonstrate examples of our data definition language to model semi-structured data, and examples of interesting queries using our declarative query language. In particular, we will show the capabilities of ASTERIX for answering geo-spatial queries and fuzzy queries, as well as ASTERIX' data feed construct for continuously ingesting data.}, acmid = {2367532}, author = {Alsubaiee, Sattam and Altowim, Yasser and Altwaijry, Hotham and Behm, Alexander and Borkar, Vinayak and Bu, Yingyi and Carey, Michael and Grover, Raman and Heilbron, Zachary and Kim, Young-Seok and Li, Chen and Onose, Nicola and Pirzadeh, Pouria and Vernica, Rares and Wen, Jian}, interhash = {ae521b66302adb1b7df3f4cdb8d92181}, intrahash = {003f2654ae41861cfb77bf0353634ac3}, issn = {2150-8097}, issue_date = {August 2012}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 12, numpages = {4}, pages = {1898--1901}, publisher = {VLDB Endowment}, title = {ASTERIX: an open source system for "Big Data" management and analysis (demo)}, url = {http://dl.acm.org/citation.cfm?id=2367502.2367532}, volume = 5, year = 2012 } @article{behm2011asterix, abstract = {ASTERIX is a new data-intensive storage and computing platform project spanning UC Irvine, UC Riverside, and UC San Diego. In this paper we provide an overview of the ASTERIX project, starting with its main goal—the storage and analysis of data pertaining to evolving-world models . We describe the requirements and associated challenges, and explain how the project is addressing them. We provide a technical overview of ASTERIX, covering its architecture, its user model for data and queries, and its approach to scalable query processing and data management. ASTERIX utilizes a new scalable runtime computational platform called Hyracks that is also discussed at an overview level; we have recently made Hyracks available in open source for use by other interested parties. We also relate our work on ASTERIX to the current state of the art and describe the research challenges that we are currently tackling as well as those that lie ahead.}, address = {Netherlands}, affiliation = {University of California, Irvine, USA}, author = {Behm, Alexander and Borkar, Vinayak and Carey, Michael and Grover, Raman and Li, Chen and Onose, Nicola and Vernica, Rares and Deutsch, Alin and Papakonstantinou, Yannis and Tsotras, Vassilis}, doi = {10.1007/s10619-011-7082-y}, interhash = {3e06363406f716c5d9340dc2c693adb3}, intrahash = {42d96cc4877943527a9259424c584740}, issn = {0926-8782}, journal = {Distributed and Parallel Databases}, keyword = {Computer Science}, number = 3, pages = {185--216}, publisher = {Springer}, title = {ASTERIX: towards a scalable, semistructured data platform for evolving-world models}, url = {http://dx.doi.org/10.1007/s10619-011-7082-y}, volume = 29, year = 2011 } @inproceedings{abiteboul1998incremental, abstract = {Semistructured data is not strictly typed like relational or object-oriented data and may be irregular or incomplete. It often arises in practice, e.g., when heterogeneous data sources are integrated or data is taken from the World Wide Web. Views over semistructured data can be used to filter the data and to restructure (or provide structure to) it. 
To achieve fast query response time, these views are often materialized. This paper studies incremental maintenance techniques for materialized views over semistructured data. We use the graph-based data model OEM and the query language Lorel, developed at Stanford, as the framework for our work. We propose a new algorithm that produces a set of queries that compute the changes to the view based upon a change to the source. We develop an analytic cost model and compare the cost of executing our incremental maintenance algorithm to that of recomputing the view. We show that for nearly all types of database updates, it is more efficient to apply our incremental maintenance algorithm to the view than to recompute the view from the database, even when there are thousands of such updates.}, author = {Abiteboul, S. and McHugh, J. and Rys, M. and Vassalos, V. and Wiener, J.}, booktitle = {Proceedings of the 24th International Conference on Very Large Data Bases}, interhash = {b395f09383de5eb21d34ad8c2b39ab59}, intrahash = {32903b757b4b4d118c77f4aeac4b0d94}, month = aug, pages = {38--49}, publisher = {Morgan Kaufmann}, title = {Incremental Maintenance for Materialized Views over Semistructured Data}, url = {http://ilpubs.stanford.edu:8090/340/}, year = 1998 } @article{dean2008mapreduce, abstract = {MapReduce is a programming model and an associated implementation for processing and generating large datasets that is amenable to a broad variety of real-world tasks. Users specify the computation in terms of a map and a reduce function, and the underlying runtime system automatically parallelizes the computation across large-scale clusters of machines, handles machine failures, and schedules inter-machine communication to make efficient use of the network and disks. Programmers find the system easy to use: more than ten thousand distinct MapReduce programs have been implemented internally at Google over the past four years, and an average of one hundred thousand MapReduce jobs are executed on Google's clusters every day, processing a total of more than twenty petabytes of data per day.}, acmid = {1327492}, address = {New York, NY, USA}, author = {Dean, Jeffrey and Ghemawat, Sanjay}, doi = {10.1145/1327452.1327492}, interhash = {b8a00982bf087c8543855897b7362a04}, intrahash = {bff539224836d703c2d21141985fa1a3}, issn = {0001-0782}, issue_date = {January 2008}, journal = {Communications of the ACM}, month = jan, number = 1, numpages = {7}, pages = {107--113}, publisher = {ACM}, title = {MapReduce: simplified data processing on large clusters}, url = {http://doi.acm.org/10.1145/1327452.1327492}, volume = 51, year = 2008 } @article{clauset2009powerlaw, abstract = {Power-law distributions occur in many situations of scientific interest and have significant consequences for our understanding of natural and man-made phenomena. Unfortunately, the detection and characterization of power laws is complicated by the large fluctuations that occur in the tail of the distribution—the part of the distribution representing large but rare events—and by the difficulty of identifying the range over which power-law behavior holds. Commonly used methods for analyzing power-law data, such as least-squares fitting, can produce substantially inaccurate estimates of parameters for power-law distributions, and even in cases where such methods return accurate answers they are still unsatisfactory because they give no indication of whether the data obey a power law at all.
Here we present a principled statistical framework for discerning and quantifying power-law behavior in empirical data. Our approach combines maximum-likelihood fitting methods with goodness-of-fit tests based on the Kolmogorov–Smirnov (KS) statistic and likelihood ratios. We evaluate the effectiveness of the approach with tests on synthetic data and give critical comparisons to previous approaches. We also apply the proposed methods to twenty-four real-world data sets from a range of different disciplines, each of which has been conjectured to follow a power-law distribution. In some cases we find these conjectures to be consistent with the data, while in others the power law is ruled out.}, author = {Clauset, Aaron and Shalizi, Cosma Rohilla and Newman, M. E. J.}, doi = {10.1137/070710111}, interhash = {9ce8658af5a6358a758bfdb819f73394}, intrahash = {c0097d202655474b1db6811ddea03410}, issn = {0036-1445}, journal = {SIAM Review}, number = 4, pages = {661--703}, publisher = {SIAM}, title = {Power-Law Distributions in Empirical Data}, url = {http://link.aip.org/link/?SIR/51/661/1}, volume = 51, year = 2009 } @inproceedings{tatti2006dimension, abstract = {Many 0/1 datasets have a very large number of variables; however, they are sparse and the dependency structure of the variables is simpler than the number of variables would suggest. Defining the effective dimensionality of such a dataset is a nontrivial problem. We consider the problem of defining a robust measure of dimension for 0/1 datasets, and show that the basic idea of fractal dimension can be adapted for binary data. However, as such the fractal dimension is difficult to interpret. Hence we introduce the concept of normalized fractal dimension. For a dataset D, its normalized fractal dimension counts the number of independent columns needed to achieve the unnormalized fractal dimension of D. The normalized fractal dimension measures the degree of dependency structure of the data. We study the properties of the normalized fractal dimension and discuss its computation. We give empirical results on the normalized fractal dimension, comparing it against PCA.}, author = {Tatti, N. and Mielikainen, T. and Gionis, A. and Mannila, H.}, booktitle = {Proceedings of the Sixth IEEE International Conference on Data Mining (ICDM 2006)}, doi = {10.1109/ICDM.2006.167}, interhash = {5164cd6a09b802d14dce6d3947df60cd}, intrahash = {0a8ad03bc7d2d0d7d77ee73eede4ecc0}, issn = {1550-4786}, month = dec, organization = {IEEE}, pages = {603--612}, title = {What is the Dimension of Your Binary Data?}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=4053086}, year = 2006 } @inproceedings{tramp2010weaving, abstract = {In this paper we tackle some of the most pressing obstacles of the emerging Linked Data Web, namely the quality, timeliness and coherence as well as direct end user benefits. We present an approach for complementing the Linked Data Web with a social dimension by extending the well-known Pingback mechanism, which is a technological cornerstone of the blogosphere, towards a Semantic Pingback. It is based on the advertising of an RPC service for propagating typed RDF links between Data Web resources. Semantic Pingback is downwards compatible with conventional Pingback implementations, thus allowing to connect and interlink resources on the Social Web with resources on the Data Web. 
We demonstrate its usefulness by showcasing use cases of the Semantic Pingback implementations in the semantic wiki OntoWiki and the Linked Data interface for database-backed Web applications Triplify.}, address = {Berlin/Heidelberg}, author = {Tramp, Sebastian and Frischmuth, Philipp and Ermilov, Timofey and Auer, Sören}, booktitle = {Proceedings of the EKAW 2010 - Knowledge Engineering and Knowledge Management by the Masses; 11th October-15th October 2010 - Lisbon, Portugal}, editor = {Cimiano, P. and Pinto, H.S.}, interhash = {c2e23e5a78627b560bb3103ecd52a410}, intrahash = {0e32b9750ee77fcdf2162f70aaee5622}, month = oct, pages = {135--149}, publisher = {Springer}, series = {Lecture Notes in Artificial Intelligence}, timestamp = {2010.06.16}, title = {Weaving a Social Data Web with Semantic Pingback}, url = {http://svn.aksw.org/papers/2010/EKAW_SemanticPingback/public.pdf}, volume = 6317, year = 2010 } @book{tufte2001visual, asin = {0961392142}, author = {Tufte, Edward R.}, dewey = {001.4226}, ean = {9780961392147}, edition = {Second}, interhash = {9900880e451150c1b06ede3c780c062b}, intrahash = {9c028ebcb336380cb02e2a4beaa14d54}, isbn = {0961392142}, publisher = {Graphics Press}, title = {The Visual Display of Quantitative Information}, url = {http://www.amazon.com/Visual-Display-Quantitative-Information-2nd/dp/0961392142}, year = 2001 } @incollection{fayyad1996data, abstract = {Data mining and knowledge discovery in databases have been attracting a significant amount of research, industry, and media attention of late. What is all the excitement about? This article provides an overview of this emerging field, clarifying how data mining and knowledge discovery in databases are related both to each other and to related fields, such as machine learning, statistics, and databases. The article mentions particular real-world applications, specific data-mining techniques, challenges involved in real-world applications of knowledge discovery, and current and future research directions in the field.}, address = {Menlo Park, CA, USA}, author = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic}, booktitle = {Advances in knowledge discovery and data mining}, editor = {Fayyad, Usama M. and Piatetsky-Shapiro, Gregory and Smyth, Padhraic and Uthurusamy, Ramasamy}, interhash = {79663e4b1f464b82ce1ae45345dc424f}, intrahash = {3f5a400d01a974f993cee1ac5f79cfc8}, isbn = {0-262-56097-6}, pages = {1--34}, publisher = {American Association for Artificial Intelligence}, title = {From data mining to knowledge discovery: an overview}, url = {http://portal.acm.org/citation.cfm?id=257942}, year = 1996 } @article{331504, abstract = {Clustering is the unsupervised classification of patterns (observations, data items, or feature vectors) into groups (clusters). The clustering problem has been addressed in many contexts and by researchers in many disciplines; this reflects its broad appeal and usefulness as one of the steps in exploratory data analysis. However, clustering is a difficult problem combinatorially, and differences in assumptions and contexts in different communities has made the transfer of useful generic concepts and methodologies slow to occur.
This paper presents an overview of pattern clustering methods from a statistical pattern recognition perspective, with a goal of providing useful advice and references to fundamental concepts accessible to the broad community of clustering practitioners. We present a taxonomy of clustering techniques, and identify cross-cutting themes and recent advances. We also describe some important applications of clustering algorithms such as image segmentation, object recognition, and information retrieval.}, address = {New York, NY, USA}, author = {Jain, A. K. and Murty, M. N. and Flynn, P. J.}, doi = {10.1145/331499.331504}, interhash = {5113b61d428d4de4423182e5f2b2f468}, intrahash = {b19bcef82a04eb82ee4abde53ee7d1c2}, issn = {0360-0300}, journal = {ACM Computing Surveys}, number = 3, pages = {264--323}, publisher = {ACM}, title = {Data clustering: a review}, url = {http://portal.acm.org/citation.cfm?id=331499.331504}, volume = 31, year = 1999 } @article{romero07, abstract = {Currently there is an increasing interest in data mining and educational systems, making educational data mining as a new growing research community. This paper surveys the application of data mining to traditional educational systems, particular web-based courses, well-known learning content management systems, and adaptive and intelligent web-based educational systems. Each of these systems has different data source and objectives for knowledge discovering. After preprocessing the available data in each case, data mining techniques can be applied: statistics and visualization; clustering, classification and outlier detection; association rule mining and pattern mining; and text mining. The success of the plentiful work needs much more specialized work in order for educational data mining to become a mature area.}, address = {Tarrytown, NY, USA}, author = {Romero, C. and Ventura, S.}, doi = {10.1016/j.eswa.2006.04.005}, interhash = {89d843f1a3b181f2a628e881d9210b22}, intrahash = {746d12e92e58587461ffcb8dc381e283}, issn = {0957-4174}, journal = {Expert Systems with Applications}, number = 1, pages = {135--146}, publisher = {Pergamon Press, Inc.}, title = {Educational data mining: A survey from 1995 to 2005}, url = {http://portal.acm.org/citation.cfm?id=1223659}, volume = 33, year = 2007 } @article{wu2008wu, abstract = {This paper presents the top 10 data mining algorithms identified by the IEEE International Conference on Data Mining (ICDM) in December 2006: C4.5, k-Means, SVM, Apriori, EM, PageRank, AdaBoost, kNN, Naive Bayes, and CART. These top 10 algorithms are among the most influential data mining algorithms in the research community. With each algorithm, we provide a description of the algorithm, discuss the impact of the algorithm, and review current and further research on the algorithm. These 10 algorithms cover classification, clustering, statistical learning, association analysis, and link mining, which are all among the most important topics in data mining research and development.}, address = {London}, author = {Wu, Xindong and Kumar, Vipin and Quinlan, J. Ross and Ghosh, Joydeep and Yang, Qiang and Motoda, Hiroshi and McLachlan, Geoffrey and Ng, Angus and Liu, Bing and Yu, Philip and Zhou, Zhi-Hua and Steinbach, Michael and Hand, David and Steinberg, Dan}, interhash = {76fd294a34cf85638f6e194a85af8db9}, intrahash = {2c34bb4b49187a6d3e780e78d254ae1f}, issn = {0219-1377}, journal = {Knowledge and Information Systems}, month = jan, number = 1, pages = {1--37}, publisher = {Springer}, title = {Top 10 algorithms in data mining}, url = {http://dx.doi.org/10.1007/s10115-007-0114-2}, volume = 14, year = 2008 } @inproceedings{conf/sdm/AggarwalY05, author = {Aggarwal, Charu C. and Yu, Philip S.}, booktitle = {Proceedings of the 2005 SIAM International Conference on Data Mining (SDM)}, interhash = {e1487d660a1614b50bd756f7383b98ea}, intrahash = {bb72c8baa786e98565c4a7448ecae59a}, title = {Online Analysis of Community Evolution in Data Streams}, url = {http://web.mit.edu/charu/www/aggar142.pdf}, year = 2005 }