% References on integrating and linking records that change over time.
% Text outside of entries is ignored by BibTeX and Biber, so these notes are safe.
% Field conventions used below:
%   address = city of the publisher (not of the conference)
%   venue   = city where the conference took place
%   acmid, numpages, interhash, intrahash = exporter metadata, ignored by standard styles

% Conference paper, World Wide Web conference 2012.
@inproceedings{pal2012information,
  author    = {Pal, Aditya and Rastogi, Vibhor and Machanavajjhala, Ashwin and Bohannon, Philip},
  title     = {Information Integration over Time in Unreliable and Uncertain Environments},
  booktitle = {Proceedings of the 21st International Conference on {World Wide Web}},
  venue     = {Lyon, France},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {789--798},
  numpages  = {10},
  year      = 2012,
  isbn      = {978-1-4503-1229-5},
  doi       = {10.1145/2187836.2187943},
  url       = {http://doi.acm.org/10.1145/2187836.2187943},
  acmid     = {2187943},
  interhash = {0dc613c467921aff444e1e33a9aeace4},
  intrahash = {3d334b8a497f199c27d56629e000f3cc},
  abstract  = {Often an interesting true value such as a stock price, sports score, or current temperature is only available via the observations of noisy and potentially conflicting sources. Several techniques have been proposed to reconcile these conflicts by computing a weighted consensus based on source reliabilities, but these techniques focus on static values. When the real-world entity evolves over time, the noisy sources can delay, or even miss, reporting some of the real-world updates. This temporal aspect introduces two key challenges for consensus-based approaches: (i) due to delays, the mapping between a source's noisy observation and the real-world update it observes is unknown, and (ii) missed updates may translate to missing values for the consensus problem, even if the mapping is known. To overcome these challenges, we propose a formal approach that models the history of updates of the real-world entity as a hidden semi-Markovian process (HSMM). The noisy sources are modeled as observations of the hidden state, but the mapping between a hidden state (i.e. real-world update) and the observation (i.e. source value) is unknown. We propose algorithms based on Gibbs Sampling and EM to jointly infer both the history of real-world updates as well as the unknown mapping between them and the source values.
We demonstrate using experiments on real-world datasets how our history-based techniques improve upon history-agnostic consensus-based approaches.},
}

% PhD symposium proposal, SIGMOD/PODS 2012.
@inproceedings{li2012linking,
  author    = {Li, Pei},
  title     = {Linking Records in Dynamic World},
  booktitle = {Proceedings of the {SIGMOD/PODS} 2012 {PhD} Symposium},
  venue     = {Scottsdale, Arizona, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {51--56},
  numpages  = {6},
  year      = 2012,
  isbn      = {978-1-4503-1326-1},
  doi       = {10.1145/2213598.2213612},
  url       = {http://doi.acm.org/10.1145/2213598.2213612},
  acmid     = {2213612},
  interhash = {91dc73d16f9ebbbaec416db6333aa2f5},
  intrahash = {18b8c7bcd4398f5502225bca23158f6e},
  abstract  = {In real-world, entities change dynamically and the changes are capture in two dimensions: time and space. For data sets that contain temporal records, where each record is associated with a time stamp and describes some aspects of a real-world entity at that particular time, we often wish to identify records that describe the same entity over time and so be able to enable interesting longitudinal data analysis. For data sets that contain geographically referenced data describing real-world entities at different locations (i.e., location entities), we wish to link those entities that belong to the same organization or network. However, existing record linkage techniques ignore additional evidence in temporal and spatial data and can fall short for these cases.

This proposal studies linking temporal and spatial records. For temporal record linkage, we apply time decay to capture the effect of elapsed time on entity value evolution, and propose clustering methods that consider time order of records in clustering. For linking location records, we distinguish between strong and weak evidence; for the former, we study core generation in presence of erroneous data, and then leverage the discovered strong evidence to make remaining decisions.},
}

% Journal article, Proceedings of the VLDB Endowment, volume 4, number 11 (2011).
% The second author's surname is Dong; Xin Luna are given names.
@article{li2011linking,
  author    = {Li, Pei and Dong, Xin Luna and Maurino, Andrea and Srivastava, Divesh},
  title     = {Linking Temporal Records},
  journal   = {Proceedings of the {VLDB} Endowment},
  volume    = 4,
  number    = 11,
  pages     = {956--967},
  month     = aug,
  year      = 2011,
  issn      = {2150-8097},
  url       = {http://hdl.handle.net/10281/28587},
  interhash = {0d8151346fd512743809aa0cfe591955},
  intrahash = {85be46ab943802120277be8f8b6b264b},
  abstract  = {Many data sets contain temporal records over a long period of time; each record is associated with a time stamp and describes some aspects of a realworld entity at that particular time (e.g., author information in DBLP). In such cases, we often wish to identify records that describe the same entity over time and so be able to enable interesting longitudinal data analysis. However, existing record linkage techniques ignore the temporal information and can fall short for temporal data. This paper studies linking temporal records. First, we apply time decay to capture the effect of elapsed time on entity value evolution. Second, instead of comparing each pair of records locally, we propose clustering methods that consider time order of the records and make global decisions. Experimental results show that our algorithms significantly outperform traditional linkage methods on various temporal data sets.},
}