@inproceedings{nivarthi2023towards, abstract = {Anomaly detection plays a pivotal role in diverse real-world applications such as cybersecurity, fault detection, network monitoring, predictive maintenance, and highly automated driving. However, obtaining labeled anomalous data can be a formidable challenge, especially when anomalies exhibit temporal evolution. This paper introduces LATAM (Long short-term memory Autoencoder with Temporal Attention Mechanism) for few-shot anomaly detection, with the aim of enhancing detection performance in scenarios with limited labeled anomaly data. LATAM effectively captures temporal dependencies and emphasizes significant patterns in multivariate time series data. In our investigation, we comprehensively evaluate LATAM against other anomaly detection models, particularly assessing its capability in few-shot learning scenarios where we have minimal examples from the normal class and none from the anomalous class in the training data. Our experimental results, derived from real-world photovoltaic inverter data, highlight LATAM's superiority, showcasing a substantial 27% mean F1 score improvement, even when trained on a mere two-week dataset. Furthermore, LATAM demonstrates remarkable results on the open-source SWaT dataset, achieving a 12% boost in accuracy with only two days of training data. Moreover, we introduce a simple yet effective dynamic thresholding mechanism, further enhancing the anomaly detection capabilities of LATAM. This underscores LATAM's efficacy in addressing the challenges posed by limited labeled anomalies in practical scenarios, and it proves valuable for downstream tasks involving temporal representation and time series prediction, extending its utility beyond anomaly detection applications.}, author = {Nivarthi, Chandana Priya and Sick, Bernhard}, booktitle = {International Conference on Machine Learning and Applications (ICMLA)}, doi = {10.1109/ICMLA58977.2023.00218}, interhash = {2c7b944a23ce00dd5e4637ce2c572f31}, intrahash = {a4a29acb67656f837ca6e532fc88958d}, pages = {1444--1450}, publisher = {IEEE}, title = {Towards Few-Shot Time Series Anomaly Detection with Temporal Attention and Dynamic Thresholding}, year = 2023 }

@article{Al_Mawla-AT-2018, abstract = {The formation of foam in amine units is an issue that plant operators and field personnel are confronted with on a regular basis. The inability to take proper actions in due time may result in plant downtime and increased emissions. Steep rises in differential pressure indicate foam formation, and are monitored manually in practice. An antifoaming agent is added in order to reduce foaming, but this is usually carried out under time pressure. Hence, plant operating authorities have expressed a strong interest in a data-driven solution capable of providing an early warning against foaming. The classical univariate alarm associated with differential pressure can be ineffective for foaming detection due to high misdetection rates and its lateness of detection. Modern univariate approaches based on pattern recognition techniques may not be suitable for an early detection either, as no universally distinctive features of differential pressure are observed prior to foaming in the present study. In this contribution, the multivariate statistical process monitoring approach based on principal component analysis (PCA) is applied to the early detection of foaming in a continuously operated Shell Claus Off-gas Treating (SCOT) unit of a major refinery in Germany.
The results are extended to facilitate fully automated and adaptive modeling based on exponentially weighted recursive principal component analysis (EWRPCA).}, author = {{Al Mawla}, H. and Kroll, A.}, doi = {10.1515/auto-2018-0048}, interhash = {c71315c65656bf01f1664b41ecdaef35}, intrahash = {4dbe292a781d9547d51a902a67f185eb}, issn = {2196-677X}, journal = {at -- Automatisierungstechnik}, number = 8, pages = {665--679}, title = {Multivariate statistical approaches for an early detection of foaming in a refinery SCOT unit}, url = {https://www.degruyter.com/view/j/auto.2018.66.issue-8/auto-2018-0048/auto-2018-0048.xml}, volume = 66, year = 2018 }

@inproceedings{pereiranunes2012entities, abstract = {The richness of the (Semantic) Web lies in its ability to link related resources as well as data across the Web. However, while relations within particular datasets are often well defined, links between disparate datasets and corpora of Web resources are rare. The increasingly widespread use of cross-domain reference datasets, such as Freebase and DBpedia, for annotating and enriching datasets as well as document corpora opens up opportunities to exploit their inherent semantics to uncover semantic relationships between disparate resources. In this paper, we present an approach to uncover relationships between disparate entities by analyzing the graphs of used reference datasets. We adapt a relationship assessment methodology from social network theory to measure the connectivity between entities in reference datasets and exploit these measures to identify correlated Web resources. Finally, we present an evaluation of our approach using the publicly available datasets BibSonomy and USAToday.}, author = {Pereira Nunes, Bernardo and Kawase, Ricardo and Dietze, Stefan and Taibi, Davide and Casanova, Marco Antonio and Nejdl, Wolfgang}, booktitle = {Proceedings of the Web of Linked Entities Workshop in conjunction with the 11th International Semantic Web Conference}, editor = {Rizzo, Giuseppe and Mendes, Pablo and Charton, Eric and Hellmann, Sebastian and Kalyanpur, Aditya}, interhash = {8f969b917268449792c130dcbab06e69}, intrahash = {f22943239296ada0dfa11c30c5b4904a}, issn = {1613-0073}, month = nov, pages = {45--57}, series = {CEUR-WS.org}, title = {Can Entities be Friends?}, url = {http://ceur-ws.org/Vol-906/paper6.pdf}, urn = {urn:nbn:de:0074-906-7}, volume = 906, year = 2012 }

@article{newman2004finding, author = {Newman, M. E. J. and Girvan, M.}, doi = {10.1103/PhysRevE.69.026113}, interhash = {b9145040e35ccb4d2a0ce18105e64ff4}, intrahash = {1dbc30a1818aa74973f387162e485443}, journal = {Phys. Rev. E}, month = feb, number = 2, numpages = {15}, pages = {026113}, publisher = {American Physical Society}, title = {Finding and evaluating community structure in networks}, url = {http://link.aps.org/doi/10.1103/PhysRevE.69.026113}, volume = 69, year = 2004 }

@article{fortunato2010community, abstract = {The modern science of networks has brought significant advances to our understanding of complex systems. One of the most relevant features of graphs representing real systems is community structure, or clustering, i.e. the organization of vertices in clusters, with many edges joining vertices of the same cluster and comparatively few edges joining vertices of different clusters.
Such clusters, or communities, can be considered fairly independent compartments of a graph, playing a similar role to, e.g., the tissues or organs in the human body. Detecting communities is of great importance in sociology, biology and computer science, disciplines where systems are often represented as graphs. This problem is very hard and not yet satisfactorily solved, despite the huge effort of a large interdisciplinary community of scientists working on it over the past few years. We will attempt a thorough exposition of the topic, from the definition of the main elements of the problem, to the presentation of most methods developed, with a special focus on techniques designed by statistical physicists, from the discussion of crucial issues like the significance of clustering and how methods should be tested and compared against each other, to the description of applications to real networks.}, author = {Fortunato, Santo}, doi = {10.1016/j.physrep.2009.11.002}, interhash = {9f6089e942903fc65309f77744c88109}, intrahash = {fddddfb8990e8ea824c8c4b62244f737}, issn = {0370-1573}, journal = {Physics Reports}, number = {3--5}, pages = {75--174}, title = {Community detection in graphs}, url = {http://www.sciencedirect.com/science/article/pii/S0370157309002841}, volume = 486, year = 2010 }

@article{Atzmueller:12c, author = {Atzmueller, Martin}, interhash = {0b20c1d53d5df05326d594726273c2fb}, intrahash = {7b616e64994893a2aad95b5ad95db662}, journal = {WIREs: Data Mining and Knowledge Discovery}, title = {{Mining Social Media: Key Players, Sentiments, and Communities}}, volume = {In Press}, year = 2012 }

@inproceedings{krause2008antisocial, abstract = {The annotation of web sites in social bookmarking systems has become a popular way to manage and find information on the web. The community structure of such systems attracts spammers: recent post pages, popular pages or specific tag pages can be manipulated easily. As a result, searching or tracking recent posts does not deliver quality results annotated in the community, but rather unsolicited, often commercial, web sites. To retain the benefits of sharing one's web content, spam-fighting mechanisms that can face the flexible strategies of spammers need to be developed.}, address = {New York, NY, USA}, author = {Krause, Beate and Schmitz, Christoph and Hotho, Andreas and Stumme, Gerd}, booktitle = {AIRWeb '08: Proceedings of the 4th International Workshop on Adversarial Information Retrieval on the Web}, doi = {10.1145/1451983.1451998}, interhash = {a45d40ac7776551301ad9dde5b25357f}, intrahash = {5b6b648fd25c15d594404ae26fcda6b4}, isbn = {978-1-60558-159-0}, location = {Beijing, China}, month = apr, pages = {61--68}, publisher = {ACM}, title = {The Anti-Social Tagger - Detecting Spam in Social Bookmarking Systems}, url = {http://airweb.cse.lehigh.edu/2008/submissions/krause_2008_anti_social_tagger.pdf}, year = 2008 }
@article{tejada2001learning, abstract = {When integrating information from multiple websites, the same data objects can exist in inconsistent text formats across sites, making it difficult to identify matching objects using exact text match. We have developed an object identification system called Active Atlas, which compares the objects' shared attributes in order to identify matching objects. Certain attributes are more important for deciding if a mapping should exist between two objects. Previous methods of object identification have required manual construction of object identification rules or mapping rules for determining the mappings between objects. This manual process is time consuming and error-prone. In our approach, Active Atlas learns to tailor mapping rules, through limited user input, to a specific application domain. The experimental results demonstrate that we achieve higher accuracy and require less user involvement than previous methods across various application domains.}, author = {Tejada, Sheila and Knoblock, Craig A and Minton, Steven}, doi = {10.1016/S0306-4379(01)00042-4}, interhash = {f9f59187b0397a0fbe1e558dfb4ad9cf}, intrahash = {5ad46801d602408ce271276f452263a9}, issn = {0306-4379}, journal = {Information Systems}, month = dec, number = 8, pages = {607--633}, title = {Learning object identification rules for information integration}, url = {http://www.sciencedirect.com/science/article/pii/S0306437901000424}, volume = 26, year = 2001 }

@article{borges2011classification, abstract = {Digital libraries of scientific articles describe them using a set of metadata, including bibliographic references. These references can be represented by several formats and styles. Considerable content variations can occur in some metadata fields such as title, author names and publication venue. Besides, it is quite common to find references that omit some metadata fields such as page numbers.
Duplicate entries affect the quality of digital library services, so they need to be appropriately identified and treated. This paper presents a comparative analysis of different data classification algorithms used to identify duplicated bibliographic metadata records. We have investigated the discovered patterns by comparing the rules and the decision tree with the heuristics adopted in a previous work. Our experiments show that the combination of previously proposed specific-purpose similarity functions and classification algorithms represents an improvement of up to 12% when compared to the experiments using our original approach.}, author = {Borges, Eduardo N. and Becker, Karin and Heuser, Carlos A. and Galante, Renata}, editor = {White, Bebo and Isaías, Pedro and Santoro, Flávia Maria}, interhash = {ca7720210214f632758211735154eea2}, intrahash = {8f87206e413c2c632b5c633f484fcbe2}, journal = {Proceedings of the IADIS International Conference WWW/Internet 2011}, pages = {221--228}, title = {A Classification-based Approach for Bibliographic Metadata Deduplication}, url = {http://www.eduardo.c3.furg.br/arquivos/download/www-internet2011.pdf}, year = 2011 }

@inproceedings{bullock2011privacyaware, abstract = {With the increased popularity of Web 2.0 services in recent years, data privacy has become a major concern for users. The more personal data users reveal, the more difficult it becomes to control its disclosure on the web. However, for Web 2.0 service providers, the data provided by users is a valuable source for offering effective, personalised data mining services. One major application is the detection of spam in social bookmarking systems: in order to prevent a decrease of content quality, providers need to distinguish spammers and exclude them from the system. They thereby experience a conflict of interests: on the one hand, they need to identify spammers based on the information they collect about users; on the other hand, they need to respect privacy concerns and process as little personal data as possible. It would therefore be of tremendous help for system developers and users to know which personal data are needed for spam detection and which can be ignored. In this paper we address these questions by presenting a data-privacy-aware feature engineering approach. It consists of the design of features for spam classification which are evaluated according to both performance and privacy conditions. Experiments using data from the social bookmarking system BibSonomy show that the two conditions need not exclude each other.}, acmid = {2024306}, address = {New York, NY, USA}, articleno = {15}, author = {Bullock, Beate Navarro and Lerch, Hana and Ro{\ss}nagel, Alexander and Hotho, Andreas and Stumme, Gerd}, booktitle = {Proceedings of the 11th International Conference on Knowledge Management and Knowledge Technologies}, doi = {10.1145/2024288.2024306}, interhash = {7a2d6a35c124ea0fe31c962f8f150916}, intrahash = {00a8f31185a34957eb16d500d7d51398}, isbn = {978-1-4503-0732-1}, location = {Graz, Austria}, numpages = {8}, pages = {15:1--15:8}, publisher = {ACM}, series = {i-KNOW '11}, title = {Privacy-aware spam detection in social bookmarking systems}, url = {http://doi.acm.org/10.1145/2024288.2024306}, year = 2011 }
@inproceedings{Yang2006, author = {Yang, Hui and Callan, James P.}, booktitle = {SIGIR}, crossref = {conf/sigir/2006}, editor = {Efthimiadis, Efthimis N. and Dumais, Susan T. and Hawking, David and Järvelin, Kalervo}, ee = {http://doi.acm.org/10.1145/1148170.1148243}, interhash = {0703044e3abd1580680e66f2355813c6}, intrahash = {27e76ac1174db2a3ee4a3efd34bb2e16}, isbn = {1-59593-369-7}, pages = {421--428}, publisher = {ACM}, title = {Near-duplicate detection by instance-level constrained clustering}, url = {http://dblp.uni-trier.de/db/conf/sigir/sigir2006.html#YangC06}, year = 2006 }

@article{cousins1998duplicate, abstract = {COPAC is a union catalog giving access to the online catalog records of some of the largest academic research libraries in the United Kingdom and Ireland. Discussion includes ways in which duplicate detection and record consolidation procedures are carried out, along with problem areas encountered. (Author/AEF)}, author = {Cousins, Shirley Anne}, interhash = {6880df322e69a00af4df1466c7730e7a}, intrahash = {a1067917a86f9aaaa1d5610ae113436c}, issn = {0165-5515}, journal = {Journal of Information Science}, number = 4, pages = {231--240}, refid = {EJ573940}, title = {Duplicate Detection and Record Consolidation in Large Bibliographic Databases: The COPAC Database Experience}, url = {http://www.eric.ed.gov/ERICWebPortal/detail?accno=EJ573940}, volume = 24, year = 1998 }

@article{kapidakis2008duplicate, author = {Sitas, Anestis and Kapidakis, Sarantos}, interhash = {94c3f69a754778b492d725bb08ffc0fb}, intrahash = {633b89b5a6827d28513545282f9f8bc7}, journal = {Library Hi Tech}, number = 2, pages = {287--301}, title = {Duplicate detection algorithms of bibliographic descriptions}, url = {http://www.ionio.gr/~sarantos/repository/j21J-LibraryHiTech-Sitas.pdf}, volume = 26, year = 2008 }
@inproceedings{jaeschke2006wege, abstract = {Folksonomies are an important building block of the newly discovered World Wide Web -- the ``Web 2.0''. In these systems, users can collaboratively manage resources and annotate them with keywords. The resulting conceptual structures are an interesting field of research. This article investigates approaches and ways to discover and structure user groups (``communities'') in folksonomies.}, address = {Halle-Wittenberg}, author = {Jäschke, Robert and Hotho, Andreas and Schmitz, Christoph and Stumme, Gerd}, booktitle = {Proc. 18. Workshop Grundlagen von Datenbanken}, editor = {Braß, Stefan and Hinneburg, Alexander}, interhash = {59224b5889a24108434a9b5ecc6b0887}, intrahash = {2b6be3bd5daee7119973fcf69909956f}, month = {June}, pages = {80--84}, publisher = {Martin-Luther-Universität}, title = {Wege zur Entdeckung von Communities in Folksonomies}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2006/jaeschke2006wege.pdf}, year = 2006 }

@inproceedings{hotho2006trend, abstract = {As the number of resources on the web exceeds by far the number of documents one can track, it becomes increasingly difficult to remain up to date on one's own areas of interest. The problem becomes more severe with the increasing fraction of multimedia data, from which it is difficult to extract some conceptual description of their contents. One way to overcome this problem is social bookmark tools, which are rapidly emerging on the web. In such systems, users are setting up lightweight conceptual structures called folksonomies, and thus overcome the knowledge acquisition bottleneck.
As more and more people participate in the effort, the use of a common vocabulary becomes more and more stable. We present an approach for discovering topic-specific trends within folksonomies. It is based on a differential adaptation of the PageRank algorithm to the triadic hypergraph structure of a folksonomy. The approach allows for any kind of data, as it does not rely on the internal structure of the documents. In particular, this allows us to consider different data types in the same analysis step. We run experiments on a large-scale real-world snapshot of a social bookmarking system.}, address = {Heidelberg}, author = {Hotho, Andreas and Jäschke, Robert and Schmitz, Christoph and Stumme, Gerd}, booktitle = {Proc. First International Conference on Semantics And Digital Media Technology (SAMT)}, editor = {Avrithis, Yannis S. and Kompatsiaris, Yiannis and Staab, Steffen and O'Connor, Noel E.}, ee = {http://dx.doi.org/10.1007/11930334_5}, interhash = {227be738c5cea57530d592463fd09abd}, intrahash = {42cda5911e901eadd0ac6a106a6aa1dc}, isbn = {3-540-49335-2}, month = {December}, pages = {56--70}, publisher = {Springer}, series = {LNCS}, title = {Trend Detection in Folksonomies}, url = {http://www.kde.cs.uni-kassel.de/stumme/papers/2006/hotho2006trend.pdf}, volume = 4306, year = 2006 }

@misc{Sarma2011, abstract = {De-duplication---identification of distinct records referring to the same real-world entity---is a well-known challenge in data integration. Since very large datasets prohibit the comparison of every pair of records, {\em blocking} has been identified as a technique of dividing the dataset for pairwise comparisons, thereby trading off {\em recall} of identified duplicates for {\em efficiency}. Traditional de-duplication tasks, while challenging, typically involved a fixed schema such as Census data or medical records. However, with the presence of large, diverse sets of structured data on the web and the need to organize it effectively on content portals, de-duplication systems need to scale in a new dimension to handle a large number of schemas, tasks and data sets, while handling ever larger problem sizes. In addition, when working in a map-reduce framework it is important that canopy formation be implemented as a {\em hash function}, making the canopy design problem more challenging. We present CBLOCK, a system that addresses these challenges. CBLOCK learns hash functions automatically from attribute domains and a labeled dataset consisting of duplicates. Subsequently, CBLOCK expresses blocking functions using a hierarchical tree structure composed of atomic hash functions. The application may guide the automated blocking process based on architectural constraints, such as by specifying a maximum size of each block (based on memory requirements), imposing disjointness of blocks (in a grid environment), or specifying a particular objective function trading off recall for efficiency. As a post-processing step to automatically generated blocks, CBLOCK {\em rolls-up} smaller blocks to increase recall. We present experimental results on two large-scale de-duplication datasets at Yahoo!---consisting of over 140K movies and 40K restaurants respectively---and demonstrate the utility of CBLOCK.
}, author = {Sarma, Anish Das and Jain, Ankur and Machanavajjhala, Ashwin and Bohannon, Philip}, interhash = {3f32848ef4bb26a3057c3feadff99c5a}, intrahash = {389dba4432b1340211ef6be8e3d45a1d}, note = {cite arxiv:1111.3689}, title = {CBLOCK: An Automatic Blocking Mechanism for Large-Scale De-duplication Tasks}, url = {http://arxiv.org/abs/1111.3689}, year = 2011 }

@article{Rosvall29012008, abstract = {To comprehend the multipartite organization of large-scale biological and social systems, we introduce an information theoretic approach that reveals community structure in weighted and directed networks. We use the probability flow of random walks on a network as a proxy for information flows in the real system and decompose the network into modules by compressing a description of the probability flow. The result is a map that both simplifies and highlights the regularities in the structure and their relationships. We illustrate the method by making a map of scientific communication as captured in the citation patterns of >6,000 journals. We discover a multicentric organization with fields that vary dramatically in size and degree of integration into the network of science. Along the backbone of the network---including physics, chemistry, molecular biology, and medicine---information flows bidirectionally, but the map reveals a directional pattern of citation from the applied fields to the basic sciences.}, author = {Rosvall, Martin and Bergstrom, Carl T.}, doi = {10.1073/pnas.0706851105}, eprint = {http://www.pnas.org/content/105/4/1118.full.pdf+html}, interhash = {8192f8db9fce0417034311e81a477838}, intrahash = {ffe2c7ca3a20430f60dfd138e72df5f5}, journal = {Proceedings of the National Academy of Sciences}, number = 4, pages = {1118--1123}, title = {Maps of random walks on complex networks reveal community structure}, url = {http://www.pnas.org/content/105/4/1118.abstract}, volume = 105, year = 2008 }