@inproceedings{baur2007generating, abstract = {The modeling of realistic networks is of great importance for modern complex systems research. Previous procedures typically model the natural growth of networks by means of iteratively adding nodes, geometric positioning information, a definition of link connectivity based on the preference for nearest neighbors or already highly connected nodes, or combine several of these approaches. Our novel model is based on the well-known concept of k-cores, originally introduced in social network analysis. Recent studies exposed the significant k-core structure of several real world systems, e.g. the AS network of the Internet. We present a simple and efficient method for generating networks which strictly adhere to the characteristics of a given k-core structure, called core fingerprint. We showcase our algorithm in a comparative evaluation with two well-known AS network generators. }, author = {Baur, Michael and Gaertler, Marco and Görke, Robert and Krug, Marcus and Wagner, Dorothea}, booktitle = {Proceedings of the European Conference on Complex Systems}, interhash = {387eebb80bbfaafab5ac201c88ebd263}, intrahash = {e2fef8dce15087afbcc3489f2029d2c6}, month = oct, title = {Generating Graphs with Predefined k-Core Structure}, url = {http://i11www.ira.uka.de/extra/publications/bggkw-ggpcs-07.pdf}, year = 2007 } @inproceedings{angelova2008characterizing, abstract = {Social networks and collaborative tagging systems are rapidly gaining popularity as a primary means for storing and sharing data among friends, family, colleagues, or perfect strangers as long as they have common interests. del.icio.us is a social network where people store and share their personal bookmarks. Most importantly, users tag their bookmarks for ease of information dissemination and later lookup. However, it is the friendship links that make delicious a social network. They exist independently of the set of bookmarks that belong to the users and have no relation to the tags typically assigned to the bookmarks. To study the interaction among users, the strength of the existing links and their hidden meaning, we introduce implicit links in the network. These links connect only highly "similar" users. Here, similarity can reflect different aspects of the user’s profile that make her similar to any other user, such as the number of shared bookmarks, or similarity of their tag clouds. We investigate the question of whether friends have common interests, we gain additional insights on the strategies that users use to assign tags to their bookmarks, and we demonstrate that the graphs formed by implicit links have unique properties differing from binomial random graphs or random graphs with an expected power-law degree distribution. }, author = {Angelova, Ralitsa and Lipczak, Marek and Milios, Evangelos and Prałat, Paweł}, booktitle = {Proceedings of the Mining Social Data Workshop (MSoDa)}, interhash = {f74d27a66d2754f3d5892d68c4abee4c}, intrahash = {02d6739886a13180dd92fbb7243ab58b}, month = jul, organization = {ECAI 2008}, pages = {21--25}, title = {Characterizing a social bookmarking and tagging network}, url = {http://www.math.ryerson.ca/~pralat/papers/2008_delicious.pdf}, year = 2008 } @article{nanba2000classification, abstract = {We are investigating automatic generation of a review (or survey) article in a specific subject domain.
In a research paper, there are passages where the author describes the essence of a cited paper and the differences between the current paper and the cited paper (we call them citing areas). These passages can be considered a kind of summary of the cited paper from the current author's viewpoint. We can know the state of the art in a specific subject domain from the collection of citing areas. Further, if these citing areas are properly classified and organized, they can act as a kind of review article. In our previous research, we proposed the automatic extraction of citing areas. Then, with the information in the citing areas, we automatically identified the types of citation relationships that indicate the reasons for citation (we call them citation types). Citation types offer a useful clue for organizing citing areas. In addition, to support writing a review article, it is necessary to take account of the contents of the papers together with the citation links and citation types. In this paper, we propose several methods for classifying papers automatically. We found that our proposed method BCCT-C (bibliographic coupling considering only type C citations, which point out the problems or gaps in related works) is more effective than the others. We also implemented a prototype system to support writing a review article, which is based on our proposed method.}, author = {Nanba, H. and Kando, N. and Okumura, M.}, doi = {10.7152/acro.v11i1.12774}, interhash = {a8fbc36d3ee8de28f65ef2486bb18cd2}, intrahash = {7a99ee2d1444ae569beb7bee04137e4b}, journal = {11th ASIS SIG/CR Classification Research Workshop}, pages = {117--134}, title = {Classification of research papers using citation links and citation types: Towards automatic review article generation}, url = {http://journals.lib.washington.edu/index.php/acro/article/download/12774/11255}, year = 2000 } @article{liu2012fulltext, author = {Liu, Xiaozhong and Zhang, Jinsong and Guo, Chun}, interhash = {011df26355ad51a88947017fd2791a98}, intrahash = {f9c6133bf4503003822f99860f864698}, journal = {Journal of the American Society for Information Science and Technology}, title = {Full-Text Citation Analysis: A New Method to Enhance Scholarly Network}, url = {http://discern.uits.iu.edu:8790/publication/Full%20text%20citation.pdf}, year = 2012 } @inproceedings{yan2012better, abstract = {Usually scientists breed research ideas inspired by previous publications, but they are unlikely to follow all publications in the unbounded literature collection. The volume of literature keeps on expanding extremely fast, whilst not all papers contribute equal impact to the academic society. Being aware of potentially influential literature would put one in an advanced position in choosing important research references. Hence, estimation of potential influence is of great significance. We study a challenging problem of identifying potentially influential literature. We examine a set of hypotheses on what the fundamental characteristics of highly cited papers are and find some interesting patterns. Based on these observations, we learn to identify potentially influential literature via Future Influence Prediction (FIP), which aims to estimate the future influence of literature. The system takes a series of features of a particular publication as input and produces as output the estimated citation counts of that article after a given time period.
We consider several regression models to formulate the learning process and evaluate their performance based on the coefficient of determination (R^2). Experimental results on a large real-world data set show a mean average predictive performance of 83.6% measured in R^2. We apply the learned model to the application of bibliography recommendation and obtain prominent performance improvement in terms of Mean Average Precision (MAP).}, acmid = {2232831}, address = {New York, NY, USA}, author = {Yan, Rui and Huang, Congrui and Tang, Jie and Zhang, Yan and Li, Xiaoming}, booktitle = {Proceedings of the 12th ACM/IEEE-CS joint conference on Digital Libraries}, doi = {10.1145/2232817.2232831}, interhash = {85d10c6d37bcbfa057c51acc325a8116}, intrahash = {9269d2dd9bf4bc8c0e7c668011fcfc1b}, isbn = {978-1-4503-1154-0}, location = {Washington, DC, USA}, numpages = {10}, pages = {51--60}, publisher = {ACM}, series = {JCDL '12}, title = {To better stand on the shoulder of giants}, url = {http://doi.acm.org/10.1145/2232817.2232831}, year = 2012 } @article{larowe2009scholarly, abstract = {The Scholarly Database aims to serve researchers and practitioners interested in the analysis, modelling, and visualization of large-scale data sets. A specific focus of this database is to support macro-evolutionary studies of science and to communicate findings via knowledge-domain visualizations. Currently, the database provides access to about 18 million publications, patents, and grants. About 90% of the publications are available in full text. Except for some datasets with restricted access conditions, the data can be retrieved in raw or pre-processed formats using either a web-based or a relational database client. This paper motivates the need for the database from the perspective of bibliometric/scientometric research. It explains the database design, setup, etc., and reports the temporal, geographical, and topic coverage of data sets currently served via the database. Planned work and the potential for this database to become a global testbed for information science research are discussed at the end of the paper.}, author = {La Rowe, Gavin and Ambre, Sumeet and Burgoon, John and Ke, Weimao and Börner, Katy}, doi = {10.1007/s11192-009-0414-2}, interhash = {1819f263b0ea1b99ec15d0c22b38207e}, intrahash = {c24611ec1f2efbdcf7f5b26d49af320e}, issn = {0138-9130}, journal = {Scientometrics}, language = {English}, number = 2, pages = {219--234}, publisher = {Springer Netherlands}, title = {The Scholarly Database and its utility for scientometrics research}, url = {http://dx.doi.org/10.1007/s11192-009-0414-2}, volume = 79, year = 2009 } @article{batagelj2011algorithms, abstract = {The structure of a large network (graph) can often be revealed by partitioning it into smaller and possibly more dense sub-networks that are easier to handle. One such decomposition is based on “}, author = {Batagelj, Vladimir and Zaveršnik, Matjaž}, doi = {10.1007/s11634-010-0079-y}, interhash = {a0bd7331f81bb4da72ce115d5943d6e4}, intrahash = {cd0d5266688af6bb98bde7f99e3a54c1}, issn = {1862-5347}, journal = {Advances in Data Analysis and Classification}, language = {English}, number = 2, pages = {129--145}, publisher = {Springer}, title = {Fast algorithms for determining (generalized) core groups in social networks}, url = {http://dx.doi.org/10.1007/s11634-010-0079-y}, volume = 5, year = 2011 } @inproceedings{zesch2007analysis, abstract = {In this paper, we discuss two graphs in Wikipedia: (i) the article graph, and (ii) the category graph.
We perform a graph-theoretic analysis of the category graph, and show that it is a scale-free, small-world graph like other well-known lexical semantic networks. We substantiate our findings by transferring semantic relatedness algorithms defined on WordNet to the Wikipedia category graph. To assess the usefulness of the category graph as an NLP resource, we analyze its coverage and the performance of the transferred semantic relatedness algorithms. }, address = {Rochester}, author = {Zesch, Torsten and Gurevych, Iryna}, booktitle = {Proceedings of the TextGraphs-2 Workshop (NAACL-HLT)}, interhash = {0401e62edb9bfa85dd498cb40301c0cb}, intrahash = {332ed720a72bf069275f93485432314b}, month = apr, pages = {1--8}, publisher = {Association for Computational Linguistics}, title = {Analysis of the Wikipedia Category Graph for NLP Applications}, url = {http://acl.ldc.upenn.edu/W/W07/W07-02.pdf#page=11}, year = 2007 } @inproceedings{takahashi2011evaluating, abstract = {We propose a method to evaluate the significance of historical entities (people, events, and so on). Here, the significance of a historical entity means how it affected other historical entities. Our proposed method first calculates the tempo-spatial impact of historical entities. The impact of a historical entity varies according to time and location. Historical entities are collected from Wikipedia. We assume that a Wikipedia link between historical entities represents an impact propagation. That is, when an entity has a link to another entity, we regard the former as influenced by the latter. Historical entities in Wikipedia usually have the date and location of their occurrence. Our proposed iteration algorithm propagates such initial tempo-spatial information through links in a similar manner to PageRank, so the tempo-spatial impact scores of all the historical entities can be calculated. We assume that a historical entity is significant if it influences many other entities that are far from it temporally or geographically. We demonstrate a prototype system and show the results of experiments that prove the effectiveness of our method.}, acmid = {1995980}, address = {New York, NY, USA}, author = {Takahashi, Yuku and Ohshima, Hiroaki and Yamamoto, Mitsuo and Iwasaki, Hirotoshi and Oyama, Satoshi and Tanaka, Katsumi}, booktitle = {Proceedings of the 22nd ACM conference on Hypertext and hypermedia}, doi = {10.1145/1995966.1995980}, interhash = {6665836546bedb1ee5d56a4d16a0848e}, intrahash = {e4769d86e71c9e7ba77d5d4af6f21e0c}, isbn = {978-1-4503-0256-2}, location = {Eindhoven, The Netherlands}, numpages = {10}, pages = {83--92}, publisher = {ACM}, title = {Evaluating significance of historical entities based on tempo-spatial impacts analysis using Wikipedia link structure}, url = {http://doi.acm.org/10.1145/1995966.1995980}, year = 2011 } @inproceedings{ollivier2007finding, abstract = {We introduce a new method for finding nodes semantically related to a given node in a hyperlinked graph: the Green method, based on a classical Markov chain tool. It is generic, adjustment-free and easy to implement. We test it in the case of the hyperlink structure of the English version of Wikipedia, the on-line encyclopedia. We present an extensive comparative study of the performance of our method versus several other classical methods in the case of Wikipedia.
The Green method is found to have both the best average results and the best robustness.}, acmid = {1619874}, author = {Ollivier, Yann and Senellart, Pierre}, booktitle = {Proceedings of the 22nd national conference on Artificial intelligence}, interhash = {a291b1b4e195dd09a11c8ffe329fc0e5}, intrahash = {76e219fe6e8a257b30c6665af8b273da}, isbn = {978-1-57735-323-2}, location = {Vancouver, British Columbia, Canada}, numpages = {7}, pages = {1427--1433}, publisher = {AAAI Press}, title = {Finding related pages using Green measures: an illustration with Wikipedia}, url = {http://dl.acm.org/citation.cfm?id=1619797.1619874}, volume = 2, year = 2007 } @article{barabsi2013network, abstract = {Professor Barabási's talk described how the tools of network science can help understand the Web's structure, development and weaknesses. The Web is an information network, in which the nodes are documents (at the time of writing over one trillion of them), connected by links. Other well-known network structures include the Internet, a physical network where the nodes are routers and the links are physical connections, and organizations, where the nodes are people and the links represent communications.}, author = {Barabási, Albert-László}, doi = {10.1098/rsta.2012.0375}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120375.full.pdf+html}, interhash = {e2cfdd2e3c7c68581e3ab691909ed28b}, intrahash = {208c1f9d6d8eff67cee07ebdf3cd0fc1}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, number = 1987, title = {Network science}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120375.abstract}, volume = 371, year = 2013 } @article{kleinberg2013analysis, abstract = {The growth of the Web has required us to think about the design of information systems in which large-scale computational and social feedback effects are simultaneously at work. At the same time, the data generated by Web-scale systems—recording the ways in which millions of participants create content, link information, form groups and communicate with one another—have made it possible to evaluate long-standing theories of social interaction, and to formulate new theories based on what we observe. These developments have created a new level of interaction between computing and the social sciences, enriching the perspectives of both of these disciplines. We discuss some of the observations, theories and conclusions that have grown from the study of Web-scale social interaction, focusing on issues including the mechanisms by which people join groups, the ways in which different groups are linked together in social networks and the interplay of positive and negative interactions in these networks.}, author = {Kleinberg, Jon}, doi = {10.1098/rsta.2012.0378}, eprint = {http://rsta.royalsocietypublishing.org/content/371/1987/20120378.full.pdf+html}, interhash = {b4686f01da53c975f342dbb40bdd1a90}, intrahash = {e3898cfb7206a7fee8eb3a5419aa030f}, journal = {Philosophical Transactions of the Royal Society A: Mathematical, Physical and Engineering Sciences}, month = mar, number = 1987, title = {Analysis of large-scale social and information networks}, url = {http://rsta.royalsocietypublishing.org/content/371/1987/20120378.abstract}, volume = 371, year = 2013 } @article{obiedkov2009building, abstract = {The use of lattice-based access control models has been somewhat restricted by their complexity. 
We argue that attribute exploration from formal concept analysis can help create lattice models of manageable size, while making it possible for the system designer to better understand dependencies between different security categories in the domain and, thus, providing certain guarantees for the relevance of the constructed model to a particular application. In this paper, we introduce the method through an example.}, author = {Obiedkov, Sergei and Kourie, Derrick G. and Eloff, J.H.P.}, doi = {10.1016/j.cose.2008.07.011}, interhash = {367ceb95cd5e3964aa2d7d00ad21da09}, intrahash = {7be2b4bf0987c4d18adf7243eae690c0}, issn = {0167-4048}, journal = {Computers and Security}, number = {1–2}, pages = {2--7}, title = {Building access control models with attribute exploration}, url = {http://www.sciencedirect.com/science/article/pii/S0167404808000497}, volume = 28, year = 2009 } @inproceedings{pfaltz2012entropy, abstract = {We introduce the concepts of closed sets and closure operators as mathematical tools for the study of social networks. Dynamic networks are represented by transformations. It is shown that under continuous change/transformation, all networks tend to "break down" and become less complex. It is a kind of entropy. The product of this theoretical decomposition is an abundance of triadically closed clusters which sociologists have observed in practice. This gives credence to the relevance of this kind of mathematical analysis in the sociological context. }, author = {Pfaltz, John L.}, booktitle = {Proceedings of the SOCINFO}, interhash = {753f13a5ffaa0946220164c2b05c230f}, intrahash = {044d0b1f6e737bede270a40bbddb0b06}, title = {Entropy in Social Networks}, year = 2012 } @article{birkholz2012scalable, abstract = {Studies on social networks have proved that endogenous and exogenous factors influence dynamics. Two streams of modeling exist on explaining the dynamics of social networks: 1) models predicting links through network properties, and 2) models considering the effects of social attributes. In this interdisciplinary study we work to overcome a number of computational limitations within these current models. We employ a mean-field model which allows for the construction of a population-specific socially informed model for predicting links from both network and social properties in large social networks. The model is tested on a population of conference coauthorship behavior, considering a number of parameters from available Web data. We address how large social networks can be modeled preserving both network and social parameters. We prove that the mean-field model, using a data-aware approach, allows us to overcome computational burdens and thus scalability issues in modeling large social networks in terms of both network and social parameters. Additionally, we confirm that large social networks evolve through both network and social-selection decisions, asserting that the dynamics of networks cannot be studied from a single perspective alone but must also consider the effects of social parameters. }, author = {Birkholz, Julie M.
and Bakhshi, Rena and Harige, Ravindra and van Steen, Maarten and Groenewegen, Peter}, interhash = {a8ef0aac2eab74fc8eb3f9d3dc8a32dd}, intrahash = {aefcc2aa922b048bec85d5070494ed81}, journal = {CoRR}, month = sep, title = {Scalable Analysis of Socially Informed Network Models: the data-aware mean-field approach}, url = {http://arxiv.org/abs/1209.6615}, volume = {abs/1209.6615}, year = 2012 } @inproceedings{baader2007completing, abstract = {We propose an approach for extending both the terminological and the assertional part of a Description Logic knowledge base by using information provided by the knowledge base and by a domain expert. The use of techniques from Formal Concept Analysis ensures that, on the one hand, the interaction with the expert is kept to a minimum, and, on the other hand, we can show that the extended knowledge base is complete in a certain, well-defined sense.}, acmid = {1625311}, address = {San Francisco, CA, USA}, author = {Baader, Franz and Ganter, Bernhard and Sertkaya, Baris and Sattler, Ulrike}, booktitle = {Proceedings of the 20th international joint conference on Artificial intelligence}, interhash = {8ab382f3aa141674412ba7ad33316a9b}, intrahash = {87f98ae486014ba78690ffa314b67da8}, location = {Hyderabad, India}, numpages = {6}, pages = {230--235}, publisher = {Morgan Kaufmann Publishers Inc.}, title = {Completing description logic knowledge bases using formal concept analysis}, url = {http://dl.acm.org/citation.cfm?id=1625275.1625311}, year = 2007 } @book{koester2006fooca, abstract = {This book deals with Formal Concept Analysis (FCA) and its application to Web Information Retrieval. It explains how Web search results retrieved by major Web search engines such as Google or Yahoo can be conceptualized, leading to a human-oriented form of representation. A generalization of Web search results is conducted, leading to an FCA-based introduction of FooCA. FooCA is an application in the field of Conceptual Knowledge Processing and supports the idea of a holistic representation of Web Information Retrieval.}, address = {Mühltal}, author = {Koester, Bjoern}, interhash = {fe53b2b1fa6be34259647954fca36bf8}, intrahash = {5571d950ada3ee1892e5c043ac438271}, publisher = {Verlag Allgemeine Wissenschaft}, series = {Beiträge zur begrifflichen Wissensverarbeitung}, title = {FooCA: web information retrieval with formal concept analysis}, url = {http://www.bjoern-koester.de/fooca/web_information_retrieval_with_formal_concept_analysis.html}, year = 2006 } @article{ley2009lessons, abstract = {The DBLP Computer Science Bibliography evolved from an early small experimental Web server to a popular service for the computer science community. Many design decisions and details of the public XML-records behind DBLP were never documented. This paper is a review of the evolution of DBLP. The main perspective is data modeling. In DBLP, persons play a central role; our discussion of person names may be applicable to many other data bases. All DBLP data are available for your own experiments.
You may either download the complete set, or use a simple XML-based API described in an online appendix.}, acmid = {1687577}, author = {Ley, Michael}, interhash = {a75ae2987d55512b7d0731c7a11a1722}, intrahash = {bb968ff4ba9ae93bc80ba05d16a98ff4}, issn = {2150-8097}, issue_date = {August 2009}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 2, numpages = {8}, pages = {1493--1500}, publisher = {VLDB Endowment}, title = {DBLP: some lessons learned}, url = {http://dl.acm.org/citation.cfm?id=1687553.1687577}, volume = 2, year = 2009 } @article{poelmans2012semiautomated, abstract = {We propose an iterative and human-centred knowledge discovery methodology based on formal concept analysis. The proposed approach recognizes the important role of the domain expert in mining real-world enterprise applications and makes use of specific domain knowledge, including human intelligence and domain-specific constraints. Our approach was empirically validated at the Amsterdam-Amstelland police to identify suspects and victims of human trafficking in 266,157 suspicious activity reports. Based on guidelines of the Attorney Generals of the Netherlands, we first defined multiple early warning indicators that were used to index the police reports. Using concept lattices, we revealed numerous unknown human trafficking and loverboy suspects. In-depth investigation by the police resulted in a confirmation of their involvement in illegal activities, resulting in actual arrests being made. Our human-centred approach was embedded into operational policing practice and is now successfully used on a daily basis to cope with the vastly growing amount of unstructured information.}, author = {Poelmans, Jonas and Elzinga, Paul and Ignatov, Dmitry I. and Kuznetsov, Sergei O.}, doi = {10.1080/03081079.2012.721662}, eprint = {http://www.tandfonline.com/doi/pdf/10.1080/03081079.2012.721662}, interhash = {18d6f6312af57cc72d7e26de4903dc9f}, intrahash = {9bb41c50dd5333f94a807482489c0732}, journal = {International Journal of General Systems}, number = 8, pages = {774--804}, title = {Semi-automated knowledge discovery: identifying and profiling human trafficking}, url = {http://www.tandfonline.com/doi/abs/10.1080/03081079.2012.721662}, volume = 41, year = 2012 } @incollection{becker2000conceptual, abstract = {Conceptual Information Systems are based on a formalization of the concept of ‘concept’ as it is discussed in traditional philosophical logic. This formalization supports a human-centered approach to the development of Information Systems. We discuss this approach by means of an implemented Conceptual Information System for supporting IT security management in companies and organizations.}, address = {Berlin/Heidelberg}, affiliation = {Entrust Technologies (Switzerland) Ltd liab.
Co Glatt Tower CH-8301 Glattzentrum Switzerland}, author = {Becker, Klaus and Stumme, Gerd and Wille, Rudolf and Wille, Uta and Zickwolff, Monika}, booktitle = {Knowledge Engineering and Knowledge Management Methods, Models, and Tools}, doi = {10.1007/3-540-39967-4_27}, editor = {Dieng, Rose and Corby, Olivier}, interhash = {dacb08013d9496d41d4f9f39bce7ecd1}, intrahash = {283f8a780ac47746cc3031ad47bfdf9c}, isbn = {978-3-540-41119-2}, keyword = {Computer Science}, pages = {352--365}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Conceptual Information Systems Discussed through an IT-Security Tool}, url = {http://dx.doi.org/10.1007/3-540-39967-4_27}, volume = 1937, year = 2000 } @incollection{stumme1998conceptual, abstract = {In this paper we discuss Conceptual Knowledge Discovery in Databases (CKDD) as it is developing in the field of Conceptual Knowledge Processing (cf. [29],[30]). Conceptual Knowledge Processing is based on the mathematical theory of Formal Concept Analysis which has become a successful theory for data analysis during the last 18 years. This approach relies on the pragmatic philosophy of Ch.S. Peirce [15] who claims that we can only analyze and argue within restricted contexts where we always rely on pre-knowledge and common sense. The development of Formal Concept Analysis led to the software system TOSCANA, which is presented as a CKDD tool in this paper. TOSCANA is a flexible navigation tool that allows dynamic browsing through and zooming into the data. It supports the exploration of large databases by visualizing conceptual aspects inherent to the data. We want to clarify that CKDD can be understood as a human-centered approach of Knowledge Discovery in Databases. The actual discussion about human-centered Knowledge Discovery is therefore briefly summarized in Section 1.}, address = {Berlin/Heidelberg}, affiliation = {Technische Universität Darmstadt Fachbereich Mathematik D-64289 Darmstadt Germany}, author = {Stumme, Gerd and Wille, Rudolf and Wille, Uta}, booktitle = {Principles of Data Mining and Knowledge Discovery}, doi = {10.1007/BFb0094849}, editor = {Zytkow, Jan and Quafafou, Mohamed}, interhash = {5ef89b6f8fb22f9d24eda7da71b8bdb1}, intrahash = {a9859c988f19684b76dc5a3f24e8278e}, isbn = {978-3-540-65068-3}, keyword = {Computer Science}, pages = {450--458}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Conceptual Knowledge Discovery in Databases using formal concept analysis methods}, url = {http://dx.doi.org/10.1007/BFb0094849}, volume = 1510, year = 1998 } @inproceedings{shipman1999beyond, acmid = {294498}, address = {New York, NY, USA}, author = {{Shipman, III}, Frank M. and Marshall, Catherine C.
and LeMere, Mark}, booktitle = {Proceedings of the tenth ACM Conference on Hypertext and hypermedia: returning to our diverse roots}, doi = {10.1145/294469.294498}, interhash = {af9b4a36c9dfe926d433aa88aea22573}, intrahash = {edf6ad72b8b8caa5dccd8219bc0ea498}, isbn = {1-58113-064-3}, location = {Darmstadt, Germany}, numpages = {10}, pages = {121--130}, publisher = {ACM}, title = {Beyond location: hypertext workspaces and non-linear views}, url = {http://doi.acm.org/10.1145/294469.294498}, year = 1999 } @article{dainotti2012extracting, abstract = {Unsolicited one-way Internet traffic, also called Internet background radiation (IBR), has been used for years to study malicious activity on the Internet, including worms, DoS attacks, and scanning address space looking for vulnerabilities to exploit. We show how such traffic can also be used to analyze macroscopic Internet events that are unrelated to malware. We examine two phenomena: country-level censorship of Internet communications described in recent work, and natural disasters (two recent earthquakes). We introduce a new metric of local IBR activity based on the number of unique IP addresses per hour contributing to IBR. The advantage of this metric is that it is not affected by bursts of traffic from a few hosts. Although we have only scratched the surface, we are convinced that IBR traffic is an important building block for comprehensive monitoring, analysis, and possibly even detection of events unrelated to the IBR itself. In particular, IBR offers the opportunity to monitor the impact of events such as natural disasters on network infrastructure, and in particular reveals a view of events that is complementary to many existing measurement platforms based on (BGP) control-plane views or targeted active ICMP probing.}, acmid = {2096154}, address = {New York, NY, USA}, author = {Dainotti, Alberto and Amman, Roman and Aben, Emile and Claffy, Kimberly C.}, doi = {10.1145/2096149.2096154}, interhash = {7c97d11971a9b652c00e0487bfc79d54}, intrahash = {dc796731675fad942af97e1bd0c17366}, issn = {0146-4833}, journal = {SIGCOMM Computer Communication Review}, month = jan, number = 1, numpages = {9}, pages = {31--39}, publisher = {ACM}, title = {Extracting benefit from harm: using malware pollution to analyze the impact of political and geophysical events on the internet}, url = {http://doi.acm.org/10.1145/2096149.2096154}, volume = 42, year = 2012 } @article{alsubaiee2012asterix, abstract = {At UC Irvine, we are building a next generation parallel database system, called ASTERIX, as our approach to addressing today's "Big Data" management challenges. ASTERIX aims to combine time-tested principles from parallel database systems with those of the Web-scale computing community, such as fault tolerance for long running jobs. In this demo, we present a whirlwind tour of ASTERIX, highlighting a few of its key features. We will demonstrate examples of our data definition language to model semi-structured data, and examples of interesting queries using our declarative query language.
In particular, we will show the capabilities of ASTERIX for answering geo-spatial queries and fuzzy queries, as well as ASTERIX' data feed construct for continuously ingesting data.}, acmid = {2367532}, author = {Alsubaiee, Sattam and Altowim, Yasser and Altwaijry, Hotham and Behm, Alexander and Borkar, Vinayak and Bu, Yingyi and Carey, Michael and Grover, Raman and Heilbron, Zachary and Kim, Young-Seok and Li, Chen and Onose, Nicola and Pirzadeh, Pouria and Vernica, Rares and Wen, Jian}, interhash = {ae521b66302adb1b7df3f4cdb8d92181}, intrahash = {003f2654ae41861cfb77bf0353634ac3}, issn = {2150-8097}, issue_date = {August 2012}, journal = {Proceedings of the VLDB Endowment}, month = aug, number = 12, numpages = {4}, pages = {1898--1901}, publisher = {VLDB Endowment}, title = {ASTERIX: an open source system for "Big Data" management and analysis (demo)}, url = {http://dl.acm.org/citation.cfm?id=2367502.2367532}, volume = 5, year = 2012 } @inproceedings{weikum2011longitudinal, abstract = {Organizations like the Internet Archive have been capturing Web contents over decades, building up huge repositories of time-versioned pages. The timestamp annotations and the sheer volume of multi-modal content constitute a gold mine for analysts of all sorts, across different application areas, from political analysts and marketing agencies to academic researchers and product developers. In contrast to traditional data analytics on click logs, the focus is on longitudinal studies over very long horizons. This longitudinal aspect affects and concerns all data and metadata, from the content itself, to the indices and the statistical metadata maintained for it. Moreover, advanced analysts prefer to deal with semantically rich entities like people, places, organizations, and ideally relationships such as company acquisitions, instead of, say, Web pages containing such references. For example, tracking and analyzing a politician's public appearances over a decade is much harder than mining frequently used query words or frequently clicked URLs for the last month. The huge size of Web archives adds to the complexity of this daunting task. This paper discusses key challenges, that we intend to take up, which are posed by this kind of longitudinal analytics: time-travel indexing and querying, entity detection and tracking along the time axis, algorithms for advanced analyses and knowledge discovery, and scalability and platform issues.}, author = {Weikum, Gerhard and Ntarmos, Nikos and Spaniol, Marc and Triantafillou, Peter and Benczúr, András and Kirkpatrick, Scott and Rigaux, Philippe and Williamson, Mark}, booktitle = {Proceedings of the 5th Biennial Conference on Innovative Data Systems Research}, interhash = {2d84fdbf82a84bfc557056df3d0dcf11}, intrahash = {6ffcc0d793bbe53bf6ed17f9d929846e}, month = jan, pages = {199--202}, title = {Longitudinal Analytics on Web Archive Data: It's About Time!}, url = {http://www.cidrdb.org/cidr2011/Papers/CIDR11_Paper26.pdf}, year = 2011 } @article{melnik2010dremel, abstract = {Dremel is a scalable, interactive ad-hoc query system for analysis of read-only nested data. By combining multi-level execution trees and columnar data layout, it is capable of running aggregation queries over trillion-row tables in seconds. The system scales to thousands of CPUs and petabytes of data, and has thousands of users at Google. In this paper, we describe the architecture and implementation of Dremel, and explain how it complements MapReduce-based computing.
We present a novel columnar storage representation for nested records and discuss experiments on few-thousand node instances of the system.}, acmid = {1920886}, author = {Melnik, Sergey and Gubarev, Andrey and Long, Jing Jing and Romer, Geoffrey and Shivakumar, Shiva and Tolton, Matt and Vassilakis, Theo}, interhash = {43835c06736099c3ebc4aaa1c9d38dbb}, intrahash = {5dae1fdc088eb801ef7663d3b35120ed}, issn = {2150-8097}, issue_date = {September 2010}, journal = {Proceedings of the VLDB Endowment}, month = sep, number = {1-2}, numpages = {10}, pages = {330--339}, publisher = {VLDB Endowment}, title = {Dremel: interactive analysis of web-scale datasets}, url = {http://dl.acm.org/citation.cfm?id=1920841.1920886}, volume = 3, year = 2010 } @article{cho2006stanford, abstract = {We describe the design and performance of WebBase, a tool for Web research. The system includes a highly customizable crawler, a repository for collected Web pages, an indexer for both text and link-related page features, and a high-speed content distribution facility. The distribution module enables researchers world-wide to retrieve pages from WebBase, and stream them across the Internet at high speed. The advantage for the researchers is that they need not all crawl the Web before beginning their research. WebBase has been used by scores of research and teaching organizations world-wide, mostly for investigations into Web topology and linguistic content analysis. After describing the system's architecture, we explain our engineering decisions for each of the WebBase components, and present respective performance measurements.}, acmid = {1149124}, address = {New York, NY, USA}, author = {Cho, Junghoo and Garcia-Molina, Hector and Haveliwala, Taher and Lam, Wang and Paepcke, Andreas and Raghavan, Sriram and Wesley, Gary}, doi = {10.1145/1149121.1149124}, interhash = {bebbc072ea2dccf4c2b27abf244c1f08}, intrahash = {3cd21bf8a87619e0489b8da177c9f0b4}, issn = {1533-5399}, issue_date = {May 2006}, journal = {ACM Transactions on Internet Technology}, month = may, number = 2, numpages = {34}, pages = {153--186}, publisher = {ACM}, title = {Stanford WebBase components and applications}, url = {http://doi.acm.org/10.1145/1149121.1149124}, volume = 6, year = 2006 } @incollection{kitsuregawa2008sociosense, abstract = {We introduce Socio-Sense Web analysis system. The system applies structural and temporal analysis methods to long term Web archive to obtain insight into the real society. 
We present an overview of the system and core methods, followed by excerpts from case studies on consumer behavior analyses.}, address = {Berlin/Heidelberg}, affiliation = {The University of Tokyo Institute of Industrial Science 4–6–1 Komaba Meguro-ku Tokyo 153-8505 Japan}, author = {Kitsuregawa, Masaru and Tamura, Takayuki and Toyoda, Masashi and Kaji, Nobuhiro}, booktitle = {Progress in WWW Research and Development}, doi = {10.1007/978-3-540-78849-2_1}, editor = {Zhang, Yanchun and Yu, Ge and Bertino, Elisa and Xu, Guandong}, interhash = {a0ac63893d45095766b0f5fc8fd78139}, intrahash = {76f35c47a4b65f229269ea9ea39829d8}, isbn = {978-3-540-78848-5}, keyword = {Computer Science}, pages = {1--8}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Socio-Sense: A System for Analysing the Societal Behavior from Long Term Web Archive}, url = {http://dx.doi.org/10.1007/978-3-540-78849-2_1}, volume = 4976, year = 2008 } @article{stirling2012archives, abstract = {The Internet has been covered by legal deposit legislation in France since 2006, making web archiving one of the missions of the Bibliothèque nationale de France (BnF). Access to the web archives has been provided in the library on an experimental basis since 2008. In the context of increasing interest in many countries in web archiving and how it may best serve the needs of researchers, especially in the expanding field of Internet studies for social sciences, a qualitative study was performed, based on interviews with potential users of the web archives held at the BnF, and particularly researchers working in various areas related to the Internet. The study aimed to explore their needs in terms of both content and services, and also to analyse different ways of representing the archives, in order to identify ways of increasing their use. While the interest of maintaining the "memory" of the web is obvious to the researchers, they are faced with the difficulty of defining, in what is a seemingly limitless space, meaningful collections of documents. Cultural heritage institutions such as national libraries are perceived as trusted third parties capable of creating rationally-constructed and well-documented collections, but such archives raise certain ethical and methodological questions.}, author = {Stirling, Peter and Chevallier, Philippe and Illien, Gildas}, doi = {10.1045/march2012-stirling}, interhash = {a783191c99a285197525595ebf509bb2}, intrahash = {4f7840193e7e435ad5dd0003fc93691a}, issn = {1082-9873}, journal = {D-Lib Magazine}, month = {March/April}, number = {3/4}, title = {Web Archives for Researchers: Representations, Expectations and Potential Uses}, url = {http://www.dlib.org/dlib/march12/stirling/03stirling.html}, volume = 18, year = 2012 } @inproceedings{brew2010using, abstract = {Tracking sentiment in the popular media has long been of interest to media analysts and pundits. With the availability of news content via online syndicated feeds, it is now possible to automate some aspects of this process. There is also great potential to crowdsource much of the annotation work that is required to train a machine learning system to perform sentiment scoring. (Crowdsourcing is a term, sometimes associated with Web 2.0 technologies, that describes the outsourcing of tasks to a large, often anonymous community.) We describe such a system for tracking economic sentiment in online media that has been deployed since August 2009.
It uses annotations provided by a cohort of non-expert annotators to train a learning system to classify a large body of news items. We report on the design challenges addressed in managing the effort of the annotators and in making annotation an interesting experience.}, acmid = {1860997}, address = {Amsterdam, The Netherlands, The Netherlands}, author = {Brew, Anthony and Greene, Derek and Cunningham, Pádraig}, booktitle = {Proceedings of the 19th European Conference on Artificial Intelligence}, editor = {Coelho, Helder and Studer, Rudi and Wooldridge, Michael}, interhash = {90650749ea1084b729710d37b5865b72}, intrahash = {9643e3c5729886b0b4e85cb3d3d704f5}, isbn = {978-1-60750-605-8}, numpages = {6}, pages = {145--150}, publisher = {IOS Press}, series = {Frontiers in Artificial Intelligence and Applications}, title = {Using Crowdsourcing and Active Learning to Track Sentiment in Online Media}, url = {http://dl.acm.org/citation.cfm?id=1860967.1860997}, volume = 215, year = 2010 } @article{pham2011development, abstract = {In contrast to many other scientific disciplines, computer science considers conference publications. Conferences have the advantage of providing fast publication of papers and of bringing researchers together to present and discuss the paper with peers. Previous work on knowledge mapping focused on the map of all sciences or a particular domain based on ISI published Journal Citation Report (JCR). Although this data cover most of the important journals, it lacks computer science conference and workshop proceedings, which results in an imprecise and incomplete analysis of the computer science knowledge. This paper presents an analysis on the computer science knowledge network constructed from all types of publications, aiming at providing a complete view of computer science research. Based on the combination of two important digital libraries (DBLP and CiteSeerX), we study the knowledge network created at journal/conference level using citation linkage, to identify the development of sub-disciplines. We investigate the collaborative and citation behavior of journals/conferences by analyzing the properties of their co-authorship and citation subgraphs. The paper draws several important conclusions. First, conferences constitute social structures that shape the computer science knowledge. Second, computer science is becoming more interdisciplinary. Third, experts are the key success factor for sustainability of journals/conferences.}, address = {Wien}, affiliation = {Information Systems and Database Technology, RWTH Aachen University, Aachen, Ahornstr. 55, 52056 Aachen, Germany}, author = {Pham, Manh and Klamma, Ralf and Jarke, Matthias}, doi = {10.1007/s13278-011-0024-x}, interhash = {193312234ed176aa8be9f35d4d1c4e72}, intrahash = {8ae08cacda75da80bfa5604cfce48449}, issn = {1869-5450}, journal = {Social Network Analysis and Mining}, keyword = {Computer Science}, number = 4, pages = {321--340}, publisher = {Springer}, title = {Development of computer science disciplines: a social network analysis approach}, url = {http://dx.doi.org/10.1007/s13278-011-0024-x}, volume = 1, year = 2011 } @inproceedings{pavlovic2012quantitative, abstract = {Formal Concept Analysis (FCA) begins from a context, given as a binary relation between some objects and some attributes, and derives a lattice of concepts, where each concept is given as a set of objects and a set of attributes, such that the first set consists of all objects that satisfy all attributes in the second, and vice versa. 
Many applications, though, provide contexts with quantitative information, telling not just whether an object satisfies an attribute, but also quantifying this satisfaction. Contexts in this form arise as rating matrices in recommender systems, as occurrence matrices in text analysis, as pixel intensity matrices in digital image processing, etc. Such applications have attracted a lot of attention, and several numeric extensions of FCA have been proposed. We propose the framework of proximity sets (proxets), which subsume partially ordered sets (posets) as well as metric spaces. One feature of this approach is that it extracts from quantified contexts quantified concepts, and thus allows full use of the available information. Another feature is that the categorical approach allows analyzing any universal properties that the classical FCA and the new versions may have, and thus provides structural guidance for aligning and combining the approaches.}, address = {Berlin/Heidelberg}, author = {Pavlovic, Dusko}, booktitle = {ICFCA 2012}, editor = {Domenach, F. and Ignatov, D.I. and Poelmans, J.}, ee = {http://arxiv.org/abs/1204.5802}, interhash = {601aaf1dbcb15e8872109be6f4a1a5d8}, intrahash = {a0c8122fe1a490e82129a24e042b371d}, issn = {0302-9743}, pages = {260--277}, publisher = {Springer}, series = {Lecture Notes in Artificial Intelligence}, title = {Quantitative Concept Analysis}, volume = 7278, year = 2012 } @article{evans2010friends, abstract = {Prior research in the social search space has focused on the informational benefits of collaborating with others during web and workplace information seeking. However, social interactions, especially during complex tasks, can have cognitive benefits as well. Our goal in this paper is to document the methods and outcomes of using social resources to help with exploratory search tasks. We used a talk-aloud protocol and video capture to explore the actions of eight subjects as they completed two ''Google-hard'' search tasks. Task questions were alternated between a Social and Non-Social Condition. The Social Condition restricted participants to use only social resources-search engines were not allowed. The Non-Social Condition permitted normal web-based information sources, but restricted the use of social tools. We describe the social tactics our participants used in their search process. Asking questions on social networking sites and targeting friends one-on-one both resulted in increased information processing but during different phases of the question-answering process. Participants received more responses via social networking sites but more thorough answers in private channels (one-on-one). We discuss the possibility that the technological and cultural affordances of different social-informational media may provide complementary cognitive benefits to searchers. Our work suggests that online social tools could be better integrated with each other and with existing search facilities. We conclude with a discussion of our findings and implications for the design of social search tools. }, address = {Tarrytown, NY, USA}, author = {Evans, Brynn M. 
and Kairam, Sanjay and Pirolli, Peter}, doi = {10.1016/j.ipm.2009.12.001}, interhash = {b6beecb1f1fb1500a3c9b7732190e4ff}, intrahash = {835394af0d9f7776978ec7f3e10cae13}, issn = {0306-4573}, journal = {Information Processing & Management}, month = nov, number = 6, numpages = {14}, pages = {679--692}, publisher = {Pergamon Press, Inc.}, title = {Do your friends make you smarter?: An analysis of social strategies in online information seeking}, url = {http://dx.doi.org/10.1016/j.ipm.2009.12.001}, volume = 46, year = 2010 } @inproceedings{wang2010claper, abstract = {Classical papers are of great help for beginners to get familiar with a new research area. However, digging them out is a difficult problem. This paper proposes Claper, a novel academic recommendation system based on two proven principles: the Principle of Download Persistence and the Principle of Citation Approaching (we prove them based on real-world datasets). The principle of download persistence indicates that classical papers have few decreasing download frequencies since they were published. The principle of citation approaching indicates that a paper which cites a classical paper is likely to cite citations of that classical paper. Our experimental results based on large-scale real-world datasets illustrate Claper can effectively recommend classical papers of high quality to beginners and thus help them enter their research areas.}, author = {Wang, Yonggang and Zhai, Ennan and Hu, Jianbin and Chen, Zhong}, booktitle = {Proceedings of the seventh International Conference on Fuzzy Systems and Knowledge Discovery}, doi = {10.1109/FSKD.2010.5569227}, interhash = {7180ddaf1c1765a45fd244027bd0bf43}, intrahash = {7da72bf2f0538afad9377a0d50c263b4}, month = aug, pages = {2777--2781}, publisher = {IEEE}, title = {Claper: Recommend classical papers to beginners}, url = {http://ieeexplore.ieee.org/xpls/abs_all.jsp?arnumber=5569227}, volume = 6, year = 2010 } @inproceedings{he2011citation, abstract = {Automatic recommendation of citations for a manuscript is highly valuable for scholarly activities since it can substantially improve the efficiency and quality of literature search. The prior techniques placed a considerable burden on users, who were required to provide a representative bibliography or to mark passages where citations are needed. In this paper we present a system that considerably reduces this burden: a user simply inputs a query manuscript (without a bibliography) and our system automatically finds locations where citations are needed. We show that naïve approaches do not work well due to massive noise in the document corpus. We produce a successful approach by carefully examining the relevance between segments in a query manuscript and the representative segments extracted from a document corpus. An extensive empirical evaluation using the CiteSeerX data set shows that our approach is effective.}, acmid = {1935926}, address = {New York, NY, USA}, author = {He, Qi and Kifer, Daniel and Pei, Jian and Mitra, Prasenjit and Giles, C. 
Lee}, booktitle = {Proceedings of the fourth ACM international conference on Web search and data mining}, doi = {10.1145/1935826.1935926}, interhash = {7e98aaf26a7ed6cc624249a3ab570d7a}, intrahash = {bbd320f03d13c6cfff4b6f9e6b4630f7}, isbn = {978-1-4503-0493-1}, location = {Hong Kong, China}, numpages = {10}, pages = {755--764}, publisher = {ACM}, title = {Citation recommendation without author supervision}, url = {http://doi.acm.org/10.1145/1935826.1935926}, year = 2011 } @inproceedings{bethard2010should, abstract = {Scientists depend on literature search to find prior work that is relevant to their research ideas. We introduce a retrieval model for literature search that incorporates a wide variety of factors important to researchers, and learns the weights of each of these factors by observing citation patterns. We introduce features like topical similarity and author behavioral patterns, and combine these with features from related work like citation count and recency of publication. We present an iterative process for learning weights for these features that alternates between retrieving articles with the current retrieval model, and updating model weights by training a supervised classifier on these articles. We propose a new task for evaluating the resulting retrieval models, where the retrieval system takes only an abstract as its input and must produce as output the list of references at the end of the abstract's article. We evaluate our model on a collection of journal, conference and workshop articles from the ACL Anthology Reference Corpus. Our model achieves a mean average precision of 28.7, a 12.8 point improvement over a term similarity baseline, and a significant improvement both over models using only features from related work and over models without our iterative learning.}, acmid = {1871517}, address = {New York, NY, USA}, author = {Bethard, Steven and Jurafsky, Dan}, booktitle = {Proceedings of the 19th ACM international conference on Information and knowledge management}, doi = {10.1145/1871437.1871517}, interhash = {1cdf6c7da38af251279e9fb915266af2}, intrahash = {369206c7472baeaa5ecefef586e16c6a}, isbn = {978-1-4503-0099-5}, location = {Toronto, ON, Canada}, numpages = {10}, pages = {609--618}, publisher = {ACM}, title = {Who should I cite: learning literature search models from citation behavior}, url = {http://doi.acm.org/10.1145/1871437.1871517}, year = 2010 } @incollection{springerlink:10.1007/978-3-642-01307-2_55, address = {Berlin/Heidelberg}, affiliation = {Department of Computer Science and Technology, Tsinghua University, Beijing, 100084 China}, author = {Tang, Jie and Zhang, Jing}, booktitle = {Advances in Knowledge Discovery and Data Mining}, doi = {10.1007/978-3-642-01307-2_55}, editor = {Theeramunkong, Thanaruk and Kijsirikul, Boonserm and Cercone, Nick and Ho, Tu-Bao}, interhash = {c429474403bcd28561f8ab4fa436d036}, intrahash = {983b4eaae55e0d5e5c628a13bf58324c}, isbn = {978-3-642-01306-5}, keyword = {Computer Science}, pages = {572--579}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {A Discriminative Approach to Topic-Based Citation Recommendation}, url = {http://dx.doi.org/10.1007/978-3-642-01307-2_55}, volume = 5476, year = 2009 } @inproceedings{he2010contextaware, abstract = {When you write papers, how many times do you want to make some citations at a place but you are not sure which papers to cite? 
Do you wish to have a recommendation system which can recommend a small number of good candidates for every place that you want to make some citations? In this paper, we present our initiative of building a context-aware citation recommendation system. High quality citation recommendation is challenging: not only should the citations recommended be relevant to the paper under composition, but also should match the local contexts of the places citations are made. Moreover, it is far from trivial to model how the topic of the whole paper and the contexts of the citation places should affect the selection and ranking of citations. To tackle the problem, we develop a context-aware approach. The core idea is to design a novel non-parametric probabilistic model which can measure the context-based relevance between a citation context and a document. Our approach can recommend citations for a context effectively. Moreover, it can recommend a set of citations for a paper with high quality. We implement a prototype system in CiteSeerX. An extensive empirical evaluation in the CiteSeerX digital library against many baselines demonstrates the effectiveness and the scalability of our approach.}, acmid = {1772734}, address = {New York, NY, USA}, author = {He, Qi and Pei, Jian and Kifer, Daniel and Mitra, Prasenjit and Giles, Lee}, booktitle = {Proceedings of the 19th international conference on World wide web}, doi = {10.1145/1772690.1772734}, interhash = {d48586d4ee897859c5d797e671f3e384}, intrahash = {17f7aa5c8bf1d9055fd83688f46fde65}, isbn = {978-1-60558-799-8}, location = {Raleigh, North Carolina, USA}, numpages = {10}, pages = {421--430}, publisher = {ACM}, title = {Context-aware citation recommendation}, url = {http://doi.acm.org/10.1145/1772690.1772734}, year = 2010 } @inproceedings{Strohman:2007:RCA:1277741.1277868, abstract = {We approach the problem of academic literature search by considering an unpublished manuscript as a query to a search system. We use the text of previous literature as well as the citation graph that connects it to find relevant related material. We evaluate our technique with manual and automatic evaluation methods, and find an order of magnitude improvement in mean average precision as compared to a text similarity baseline.}, acmid = {1277868}, address = {New York, NY, USA}, author = {Strohman, Trevor and Croft, W. Bruce and Jensen, David}, booktitle = {Proceedings of the 30th annual international ACM SIGIR conference on Research and development in information retrieval}, doi = {10.1145/1277741.1277868}, interhash = {a34279add7d7a9f3c564735b7b8dcd44}, intrahash = {7a0b1ff2a40b3989ef8d83daabd91159}, isbn = {978-1-59593-597-7}, location = {Amsterdam, The Netherlands}, numpages = {2}, pages = {705--706}, publisher = {ACM}, title = {Recommending citations for academic papers}, url = {http://doi.acm.org/10.1145/1277741.1277868}, year = 2007 } @inproceedings{doerfel2012publication, abstract = {We present an analysis of the publication and citation networks of all previous editions of the three conferences most relevant to the FCA community: ICFCA, ICCS and CLA. Using data mining methods from FCA and graph analysis, we investigate patterns and communities among authors, we identify and visualize influential publications and authors, and we give a statistical summary of the conferences’ history. 
}, address = {Berlin/Heidelberg}, author = {Doerfel, Stephan and Jäschke, Robert and Stumme, Gerd}, booktitle = {Formal Concept Analysis}, doi = {10.1007/978-3-642-29892-9_12}, editor = {Domenach, F. and Ignatov, D.I. and Poelmans, J.}, interhash = {f34f31e8dd1e07b1b0a5ab688f10084a}, intrahash = {9207cd4b1cf7d87c9ae959ac780e152c}, isbn = {978-3-642-29891-2}, month = may, pages = {77--95}, publisher = {Springer}, series = {Lecture Notes in Artificial Intelligence}, title = {Publication Analysis of the Formal Concept Analysis Community}, url = {http://link.springer.com/chapter/10.1007/978-3-642-29892-9_12}, volume = 7278, year = 2012 } @inproceedings{poelmans2011mining, abstract = {Formal Concept Analysis (FCA) is an unsupervised clustering technique and many scientific papers are devoted to applying FCA in Information Retrieval (IR) research. We collected 103 papers published between 2003-2009 which mention FCA and information retrieval in the abstract, title or keywords. Using a prototype of our FCA-based toolset CORDIET, we converted the pdf-files containing the papers to plain text, indexed them with Lucene using a thesaurus containing terms related to FCA research and then created the concept lattice shown in this paper. We visualized, analyzed and explored the literature with concept lattices and discovered multiple interesting research streams in IR of which we give an extensive overview. The core contributions of this paper are the innovative application of FCA to the text mining of scientific papers and the survey of the FCA-based IR research. }, author = {Poelmans, Jonas and Elzinga, Paul and Viaene, Stijn and Dedene, Guido and Kuznetsov, Sergei O.}, booktitle = {Industrial Conference on Data Mining - Poster and Industry Proceedings}, editor = {Perner, Petra}, interhash = {b44d11ea5b5a4df8ee30a9c572d82051}, intrahash = {164c37be60c1a47d1727ad9b82f01237}, isbn = {978-3-942954-06-4}, pages = {82--96}, publisher = {IBaI Publishing}, title = {Text Mining Scientific Papers: a Survey on {FCA}-based Information Retrieval Research.}, url = {http://dblp.uni-trier.de/db/conf/incdm/incdm2011p.html#PoelmansEVDK11}, year = 2011 } @incollection{poelmans2010formal, abstract = {In this paper, we analyze the literature on Formal Concept Analysis (FCA) using FCA. We collected 702 papers published between 2003-2009 mentioning Formal Concept Analysis in the abstract. We developed a knowledge browsing environment to support our literature analysis process. The pdf-files containing the papers were converted to plain text and indexed by Lucene using a thesaurus containing terms related to FCA research. We use the visualization capabilities of FCA to explore the literature, to discover and conceptually represent the main research topics in the FCA community. 
As a case study, we zoom in on the 140 papers on using FCA in knowledge discovery and data mining and give an extensive overview of the contents of this literature.}, address = {Berlin/Heidelberg}, author = {Poelmans, Jonas and Elzinga, Paul and Viaene, Stijn and Dedene, Guido}, booktitle = {Conceptual Structures: From Information to Intelligence}, doi = {10.1007/978-3-642-14197-3_15}, editor = {Croitoru, Madalina and Ferré, Sébastien and Lukose, Dickson}, interhash = {713d63f847ff4b2cbf613fc0508eb31b}, intrahash = {9694689a034cc02aae1e27114ca26a94}, isbn = {978-3-642-14196-6}, pages = {139--153}, publisher = {Springer}, series = {Lecture Notes in Computer Science}, title = {Formal Concept Analysis in Knowledge Discovery: A Survey}, url = {http://dx.doi.org/10.1007/978-3-642-14197-3_15}, volume = 6208, year = 2010 } @article{wille1995basic, abstract = {Experiences with applications of concept lattices and the pragmatic philosophy founded by Ch. S. Peirce have suggested a triadic approach to formal concept analysis. It starts with the notion of a triadic context combining objects, attributes, and conditions under which objects may have certain attributes. The Basic Theorem of triadic concept analysis clarifies the class of structures which are formed by the triadic concepts of triadic contexts: These structures are exactly the complete trilattices up to isomorphism.}, affiliation = {Fachbereich Mathematik, Technische Hochschule Darmstadt, 64289 Darmstadt, Germany}, author = {Wille, Rudolf}, doi = {10.1007/BF01108624}, interhash = {c5223bed0a0995c5f3cd8962a9d54212}, intrahash = {23959c014b9b6d46c93f45cf68e52294}, issn = {0167-8094}, journal = {Order}, keyword = {Mathematics and Statistics}, number = 2, pages = {149--158}, publisher = {Springer Netherlands}, title = {The Basic Theorem of triadic concept analysis}, url = {http://dx.doi.org/10.1007/BF01108624}, volume = 12, year = 1995 } @article{bollen2009clickstream, abstract = {Background: Intricate maps of science have been created from citation data to visualize the structure of scientific activity. However, most scientific publications are now accessed online. Scholarly web portals record detailed log data at a scale that exceeds the number of all existing citations combined. Such log data is recorded immediately upon publication and keeps track of the sequences of user requests (clickstreams) that are issued by a variety of users across many different domains. Given these advantages of log datasets over citation data, we investigate whether they can produce high-resolution, more current maps of science. Methodology: Over the course of 2007 and 2008, we collected nearly 1 billion user interactions recorded by the scholarly web portals of some of the most significant publishers, aggregators and institutional consortia. The resulting reference data set covers a significant part of world-wide use of scholarly web portals in 2006, and provides a balanced coverage of the humanities, social sciences, and natural sciences. A journal clickstream model, i.e. a first-order Markov chain, was extracted from the sequences of user interactions in the logs. The clickstream model was validated by comparing it to the Getty Research Institute's Architecture and Art Thesaurus. The resulting model was visualized as a journal network that outlines the relationships between various scientific domains and clarifies the connection of the social sciences and humanities to the natural sciences.
Conclusions: Maps of science resulting from large-scale clickstream data provide a detailed, contemporary view of scientific activity and correct the underrepresentation of the social sciences and humanities that is commonly found in citation data.}, author = {Bollen, Johan and van de Sompel, Herbert and Hagberg, Aric and Bettencourt, Luis and Chute, Ryan and Rodriguez, Marko A. and Balakireva, Lyudmila}, doi = {10.1371/journal.pone.0004803}, interhash = {3a371a1ed31d14204770315b52023b96}, intrahash = {e61bd0c26cc1c08cff22a8301d03044f}, journal = {PLoS ONE}, month = mar, number = 3, pages = {e4803}, publisher = {Public Library of Science}, title = {Clickstream Data Yields High-Resolution Maps of Science}, url = {http://dx.doi.org/10.1371/journal.pone.0004803}, volume = 4, year = 2009 } @article{leydesdorff2012alternatives, abstract = {Journal Impact Factors (IFs) can be considered historically as the first attempt to normalize citation distributions by using averages over two years. However, it has been recognized that citation distributions vary among fields of science and that one needs to normalize for this. Furthermore, the mean, or any central-tendency statistic, is not a good representation of the citation distribution because these distributions are skewed. Important steps have been taken to solve these two problems during the last few years. First, one can normalize at the article level using the citing audience as the reference set. Second, one can use non-parametric statistics for testing the significance of differences among ratings. A proportion of most-highly cited papers (the top-10% or top-quartile) on the basis of fractional counting of the citations may provide an alternative to the current IF. This indicator is intuitively simple, allows for statistical testing, and accords with the state of the art.}, author = {Leydesdorff, Loet}, interhash = {8d14f862a94fb45d31172f8d2a6485fa}, intrahash = {bd589cc0b6fdfc74b5eea4262c46d3a4}, journal = {Digital Libraries}, title = {Alternatives to the Journal Impact Factor: I3 and the Top-10% (or Top-25%?) of the Most-Highly Cited Papers}, url = {http://arxiv.org/abs/1201.4638}, volume = {1201.4638}, year = 2012 } @inproceedings{daquin2011extracting, abstract = {With the rise of linked data, more and more semantically described information is being published online according to the principles and technologies of the Semantic Web (especially, RDF and SPARQL). The use of such standard technologies means that this data should be exploitable, integrable and reusable straight away. However, once a potentially interesting dataset has been discovered, significant efforts are currently required in order to understand its schema, its content, the way to query it and what it can answer. In this paper, we propose a method and a tool to automatically discover questions that can be answered by an RDF dataset. We use formal concept analysis to build a hierarchy of meaningful sets of entities from a dataset. These sets of entities represent answers, whose common characteristics represent the clauses of the corresponding questions. This hierarchy can then be used as a querying interface, proposing questions of varying levels of granularity and specificity to the user. A major issue is however that thousands of questions can be included in this hierarchy.
Based on an empirical analysis and using metrics inspired both from formal concept analysis and from ontology summarization, we devise an approach for identifying relevant questions to act as a starting point to the navigation in the question hierarchy.}, acmid = {1999698}, address = {New York, NY, USA}, author = {d'Aquin, Mathieu and Motta, Enrico}, booktitle = {Proceedings of the sixth international conference on Knowledge capture}, doi = {10.1145/1999676.1999698}, interhash = {7794150f2b42c21956eb7fb419ca0248}, intrahash = {45374b975834248c0cd87022fc854e25}, isbn = {978-1-4503-0396-5}, location = {Banff, Alberta, Canada}, numpages = {8}, pages = {121--128}, publisher = {ACM}, title = {Extracting relevant questions to an RDF dataset using formal concept analysis}, url = {http://doi.acm.org/10.1145/1999676.1999698}, year = 2011 } @article{thijs2006influence, abstract = {In earlier studies by the authors, basic regularities of author self-citations have been analysed. These regularities are related to the ageing, to the relation between self-citations and foreign citations, to the interdependence of self-citations with other bibliometric indicators and to the influence of co-authorship on self-citation behaviour. Although both national and subject specific peculiarities influence the share of self-citations at the macro level, the authors came to the conclusion that - at this level of aggregation - there is practically no need for excluding self-citations. The aim of the present study is to answer the question to what extent the influence of author self-citations on bibliometric meso-indicators deviates from that at the macro level, and to what extent national reference standards can be used in bibliometric meso analyses. In order to study the situation at the institutional level, a selection of twelve European universities representing different countries and different research profiles has been made. The results show a quite complex situation at the meso-level; therefore we suggest the usage of both indicators, including and excluding self-citations.}, affiliation = {Katholieke Universiteit Leuven, Steunpunt O&O Statistieken, Leuven, Belgium}, author = {Thijs, Bart and Glänzel, Wolfgang}, doi = {10.1007/s11192-006-0006-3}, interhash = {82ea078d91ba87557fb69d7fba5171bc}, intrahash = {c360454b0f49b781ccbbe16840f54b35}, issn = {0138-9130}, issue = {1}, journal = {Scientometrics}, keyword = {Computer Science}, number = 1, pages = {71--80}, publisher = {Akadémiai Kiadó}, title = {The influence of author self-citations on bibliometric meso-indicators. {The case of European universities}}, url = {http://dx.doi.org/10.1007/s11192-006-0006-3}, volume = 66, year = 2006 } @article{glanzel2004influence, abstract = {In a recent paper the authors have studied the role of author self-citations within the process of documented scientific communication. Two important regularities, such as the relatively fast ageing of self-citations with respect to foreign citations and the “square-root law” characterising the conditional expectation of self-citations for a given number of foreign citations, have been found studying the phenomenon of author self-citations at the macro level. The goal of the present paper is to study the effect of author self-citations on macro indicators.
The analysis of citation-based indicators for 15 fields in the sciences, social sciences and humanities substantiates that at this level of aggregation there is no need for any revision of national indicators and the underlying journal citation measures in the context of excluding self-citations.}, affiliation = {Katholieke Universiteit Leuven, Steunpunt O&O Statistieken, Dekenstraat 2, B-3000 Leuven, Belgium}, author = {Glänzel, Wolfgang and Thijs, Bart}, doi = {10.1023/B:SCIE.0000018535.99885.e9}, interhash = {22052b43afef9648c14e5cf7f1ca5560}, intrahash = {d23fb8f74f6e1623bc0a845df9d630f0}, issn = {0138-9130}, issue = {3}, journal = {Scientometrics}, keyword = {Computer Science}, number = 3, pages = {281--310}, publisher = {Akadémiai Kiadó}, title = {The influence of author self-citations on bibliometric macro indicators}, url = {http://dx.doi.org/10.1023/B:SCIE.0000018535.99885.e9}, volume = 59, year = 2004 } @article{redner1998popular, abstract = {Numerical data for the distribution of citations are examined for: (i) papers published in 1981 in journals which are catalogued by the Institute for Scientific Information (783,339 papers) and (ii) 20 years of publications in Physical Review D, vols. 11-50 (24,296 papers). A Zipf plot of the number of citations to a given paper versus its citation rank appears to be consistent with a power-law dependence for leading rank papers, with exponent close to -1/2. This, in turn, suggests that the number of papers with x citations, N(x), has a large-x power law decay N(x)~x^{-alpha}, with alpha approximately equal to 3.}, author = {Redner, S.}, doi = {10.1007/s100510050359}, eprint = {arXiv:cond-mat/9804163}, interhash = {cf0b54f3514619888dfce1ec17accde4}, intrahash = {ab39b67b23937efa55b91598c9d7f24b}, journal = {European Physical Journal B}, month = aug, number = 2, pages = {131--134}, title = {How popular is your paper? An empirical study of the citation distribution}, url = {http://arxiv.org/abs/cond-mat/9804163}, volume = 4, year = 1998 } @article{bhattacharya2007collective, abstract = {Many databases contain uncertain and imprecise references to real-world entities. The absence of identifiers for the underlying entities often results in a database which contains multiple references to the same entity. This can lead not only to data redundancy, but also to inaccuracies in query processing and knowledge extraction. These problems can be alleviated through the use of entity resolution. Entity resolution involves discovering the underlying entities and mapping each database reference to these entities. Traditionally, entities are resolved using pairwise similarity over the attributes of references. However, there is often additional relational information in the data. Specifically, references to different entities may cooccur. In these cases, collective entity resolution, in which entities for cooccurring references are determined jointly rather than independently, can improve entity resolution accuracy. We propose a novel relational clustering algorithm that uses both attribute and relational information for determining the underlying domain entities, and we give an efficient implementation. We investigate the impact that different relational similarity measures have on entity resolution quality. We evaluate our collective entity resolution algorithm on multiple real-world databases.
We show that it improves entity resolution performance over both attribute-based baselines and over algorithms that consider relational information but do not resolve entities collectively. In addition, we perform detailed experiments on synthetically generated data to identify data characteristics that favor collective relational resolution over purely attribute-based algorithms.}, acmid = {1217304}, address = {New York, NY, USA}, articleno = {5}, author = {Bhattacharya, Indrajit and Getoor, Lise}, doi = {10.1145/1217299.1217304}, interhash = {3fdd3dfe026b0f18c7b9927ebe471cf1}, intrahash = {5c65a3d97ac6933ca2f63480630d99cf}, issn = {1556-4681}, issue = {1}, issue_date = {March 2007}, journal = {ACM Transactions on Knowledge Discovery from Data}, month = mar, number = 1, publisher = {ACM}, title = {Collective entity resolution in relational data}, url = {http://doi.acm.org/10.1145/1217299.1217304}, volume = 1, year = 2007 }
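The bhattacharya2007collective entry above describes collective entity resolution: co-occurring references are resolved jointly by combining attribute similarity with relational evidence, rather than by attribute similarity alone. As a rough illustration of that general idea only, the Python sketch below merges references when a weighted mix of name similarity and neighborhood overlap crosses a threshold; the names attr_sim, rel_sim and resolve, the greedy merging loop, the weight alpha, and the toy data are hypothetical simplifications and are not the relational clustering algorithm of the cited paper.

# Minimal sketch of collective entity resolution (hypothetical, simplified;
# not the Bhattacharya & Getoor 2007 algorithm).
from difflib import SequenceMatcher

def attr_sim(a, b):
    # Attribute evidence: string similarity between the two reference names.
    return SequenceMatcher(None, a["name"], b["name"]).ratio()

def rel_sim(a, b, clusters):
    # Relational evidence: Jaccard overlap of the clusters each reference co-occurs with.
    na = {clusters[r] for r in a["coocc"]}
    nb = {clusters[r] for r in b["coocc"]}
    return len(na & nb) / len(na | nb) if na | nb else 0.0

def resolve(refs, alpha=0.6, threshold=0.75):
    # Greedy collective resolution: keep merging pairs whose combined score
    # (alpha-weighted attribute + relational similarity) exceeds the threshold.
    clusters = {i: i for i in refs}  # each reference starts in its own cluster
    merged = True
    while merged:
        merged = False
        for i in refs:
            for j in refs:
                if i < j and clusters[i] != clusters[j]:
                    score = alpha * attr_sim(refs[i], refs[j]) + \
                            (1 - alpha) * rel_sim(refs[i], refs[j], clusters)
                    if score >= threshold:
                        old, new = clusters[j], clusters[i]
                        clusters = {k: (new if c == old else c) for k, c in clusters.items()}
                        merged = True
    return clusters

# Toy usage: two name variants that co-occur with the same third reference
# end up in one cluster; the third reference stays separate.
refs = {
    0: {"name": "J. Smith",   "coocc": [2]},
    1: {"name": "John Smith", "coocc": [2]},
    2: {"name": "A. Jones",   "coocc": [0, 1]},
}
print(resolve(refs))  # e.g. {0: 0, 1: 0, 2: 2}

The key design point the sketch tries to convey is that rel_sim is computed over the current clustering, so each merge can raise the relational similarity of other pairs and trigger further merges; a purely attribute-based baseline would score every pair once, in isolation.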