@inproceedings{conf/pkdd/ReadPHF09,
  author = {Read, Jesse and Pfahringer, Bernhard and Holmes, Geoffrey and Frank, Eibe},
  booktitle = {ECML/PKDD (2)},
  crossref = {conf/pkdd/2009-2},
  date = {2009-08-31},
  editor = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee = {http://dx.doi.org/10.1007/978-3-642-04174-7_17},
  interhash = {d07ad188ba08d6931d30643b849de079},
  intrahash = {ab264cc42b2f1530ab6da09aaf5fa0fc},
  isbn = {978-3-642-04173-0},
  pages = {254--269},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Classifier Chains for Multi-label Classification},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-2.html#ReadPHF09},
  volume = 5782,
  year = 2009
}

@inproceedings{conf/pkdd/MaesPDG09,
  abstract = {Collective classification refers to the classification of interlinked and relational objects described as nodes in a graph. The Iterative Classification Algorithm (ICA) is a simple, efficient and widely used method to solve this problem. It is representative of a family of methods for which inference proceeds as an iterative process: at each step, nodes of the graph are classified according to the current predicted labels of their neighbors. We show that learning in this class of models suffers from a training bias. We propose a new family of methods, called Simulated ICA, which helps reduce this training bias by simulating inference during learning. Several variants of the method are introduced. They are simple and efficient, and they scale well. Experiments performed on a series of 7 datasets show that the proposed methods outperform representative state-of-the-art algorithms while keeping a low complexity.},
  author = {Maes, Francis and Peters, Stéphane and Denoyer, Ludovic and Gallinari, Patrick},
  booktitle = {ECML/PKDD (2)},
  crossref = {conf/pkdd/2009-2},
  date = {2009-08-31},
  editor = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee = {http://dx.doi.org/10.1007/978-3-642-04174-7_4},
  interhash = {91c999fb8704c3e4301df8c967a1c711},
  intrahash = {6308dba1d66e8118b891c0e75273b0a7},
  isbn = {978-3-642-04173-0},
  pages = {47--62},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Simulated Iterative Classification: A New Learning Procedure for Graph Labeling},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-2.html#MaesPDG09},
  volume = 5782,
  year = 2009
}
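% Note on MaesPDG09: the inference loop the abstract describes (each node is
% repeatedly re-labeled from its own features plus its neighbors' current
% predicted labels) is worth making concrete. Below is a minimal, illustrative
% Python sketch of plain ICA inference; the toy graph, feature encoding, and
% stand-in classifier are hypothetical, and the paper's contribution
% (Simulated ICA, which simulates this loop during training) is not reproduced.
%
% import numpy as np
%
% def ica_inference(adj, features, classify, n_labels, max_iter=10):
%     # adj: node -> list of neighbors; features: node -> feature vector;
%     # classify: maps [node features, neighbor-label fractions] to a label.
%     labels = {v: classify(np.concatenate([features[v], np.zeros(n_labels)]))
%               for v in adj}  # bootstrap with no neighbor information
%     for _ in range(max_iter):
%         changed = False
%         for v in adj:
%             frac = np.zeros(n_labels)
%             for u in adj[v]:
%                 frac[labels[u]] += 1.0
%             if adj[v]:
%                 frac /= len(adj[v])
%             new = classify(np.concatenate([features[v], frac]))
%             changed |= new != labels[v]
%             labels[v] = new
%         if not changed:  # labels reached a fixed point
%             break
%     return labels
%
% # Toy run: a stand-in "model" that follows the neighbors' majority label.
% adj = {0: [1], 1: [0, 2], 2: [1]}
% feats = {v: np.array([float(v)]) for v in adj}
% majority = lambda x: int(x[-1] >= x[-2])
% print(ica_inference(adj, feats, majority, n_labels=2))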
@article{weiwei2009combining,
  abstract = {Multilabel classification is an extension of conventional classification in which a single instance can be associated with multiple labels. Recent research has shown that, just like for conventional classification, instance-based learning algorithms relying on the nearest neighbor estimation principle can be used quite successfully in this context. However, since hitherto existing algorithms do not take correlations and interdependencies between labels into account, their potential has not yet been fully exploited. In this paper, we propose a new approach to multilabel classification, which is based on a framework that unifies instance-based learning and logistic regression, comprising both methods as special cases. This approach allows one to capture interdependencies between labels and, moreover, to combine model-based and similarity-based inference for multilabel classification. As will be shown by experimental studies, our approach is able to improve predictive accuracy in terms of several evaluation criteria for multilabel prediction.},
  author = {Cheng, Weiwei and Hüllermeier, Eyke},
  interhash = {1f49c2672a44144b7073d3d7e9f82346},
  intrahash = {40bd0d294de6c597255ae86dff700230},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {6--6},
  title = {Combining Instance-Based Learning and Logistic Regression for Multilabel Classification},
  url = {http://dx.doi.org/10.1007/978-3-642-04180-8_6},
  year = 2009
}

@article{christian2009withinnetwork,
  abstract = {Within-network classification, where the goal is to classify the nodes of a partly labeled network, is a semi-supervised learning problem that has applications in several important domains like image processing, the classification of documents, and the detection of malicious activities. While most methods for this problem infer the missing labels collectively based on the hypothesis that linked or nearby nodes are likely to have the same labels, there are many types of networks for which this assumption fails, e.g., molecular graphs, trading networks, etc. In this paper, we present a collective classification method, based on relaxation labeling, that classifies entities of a network using their local structure. This method uses a marginalized similarity kernel that compares the local structure of two nodes with random walks in the network. Through experimentation on different datasets, we show our method to be more accurate than several state-of-the-art approaches for this problem.},
  author = {Desrosiers, Christian and Karypis, George},
  interhash = {5db04cc3cfea4d9777a55c7c9a44f71c},
  intrahash = {fbcbbf5c016ec86fe15591e70f71b66b},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {260--275},
  title = {Within-Network Classification Using Local Structure Similarity},
  url = {http://dx.doi.org/10.1007/978-3-642-04180-8_34},
  year = 2009
}

@inproceedings{conf/pkdd/MirowskiL09,
  author = {Mirowski, Piotr W. and LeCun, Yann},
  booktitle = {ECML/PKDD (2)},
  crossref = {conf/pkdd/2009-2},
  date = {2009-08-31},
  editor = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee = {http://dx.doi.org/10.1007/978-3-642-04174-7_9},
  interhash = {8e22ad8f9308bb7122db75a010c2ff7b},
  intrahash = {af7db8d2cbde8d65b60260a54d368bd0},
  isbn = {978-3-642-04173-0},
  pages = {128--143},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Dynamic Factor Graphs for Time Series Modeling},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-2.html#MirowskiL09},
  volume = 5782,
  year = 2009
}
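% Note on weiwei2009combining: the abstract describes feeding nearest-neighbor
% label statistics into logistic regression so that inter-label dependencies
% can be captured. The sketch below is a rough approximation of that idea with
% scikit-learn, assuming Y is a binary label-indicator matrix; it is not the
% authors' exact IBLR model, and all names are illustrative.
%
% import numpy as np
% from sklearn.linear_model import LogisticRegression
% from sklearn.neighbors import NearestNeighbors
%
% def fit_iblr_like(X, Y, k=2):
%     # One logistic model per label; its inputs are the neighbors' label
%     # fractions for *all* labels, which lets label dependencies enter.
%     nn = NearestNeighbors(n_neighbors=k + 1).fit(X)
%     idx = nn.kneighbors(X, return_distance=False)[:, 1:]  # drop self
%     Z = Y[idx].mean(axis=1)  # (n_samples, n_labels) neighbor statistics
%     return nn, [LogisticRegression().fit(Z, Y[:, j]) for j in range(Y.shape[1])]
%
% def predict_iblr_like(nn, models, Ytrain, Xnew, k=2):
%     idx = nn.kneighbors(Xnew, n_neighbors=k, return_distance=False)
%     Z = Ytrain[idx].mean(axis=1)
%     return np.column_stack([m.predict(Z) for m in models])
%
% # Toy data: two clusters, each with its own label pattern.
% X = np.array([[0.0], [0.2], [0.4], [1.0], [1.2], [1.4]])
% Y = np.array([[1, 0], [1, 0], [1, 0], [0, 1], [0, 1], [0, 1]])
% nn, models = fit_iblr_like(X, Y)
% print(predict_iblr_like(nn, models, Y, np.array([[0.1], [1.3]])))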
@inproceedings{conf/pkdd/AkogluF09,
  author = {Akoglu, Leman and Faloutsos, Christos},
  booktitle = {ECML/PKDD (1)},
  crossref = {conf/pkdd/2009-1},
  date = {2009-08-31},
  editor = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee = {http://dx.doi.org/10.1007/978-3-642-04180-8_13},
  interhash = {41cfe5a9af68deacdf3881536d5f1e0d},
  intrahash = {3af4a53fd0b650b6914f89a208bdc753},
  isbn = {978-3-642-04179-2},
  pages = {13--28},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {RTG: A Recursive Realistic Graph Generator Using Random Typing},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-1.html#AkogluF09},
  volume = 5781,
  year = 2009
}

@inproceedings{conf/pkdd/BerlingerioBBG09,
  author = {Berlingerio, Michele and Bonchi, Francesco and Bringmann, Björn and Gionis, Aristides},
  booktitle = {ECML/PKDD (1)},
  crossref = {conf/pkdd/2009-1},
  date = {2009-08-31},
  editor = {Buntine, Wray L. and Grobelnik, Marko and Mladenic, Dunja and Shawe-Taylor, John},
  ee = {http://dx.doi.org/10.1007/978-3-642-04180-8_25},
  interhash = {a957e8240b0182eae34571c3416bb08c},
  intrahash = {595fb646c89246d26cb53323ff8a8464},
  isbn = {978-3-642-04179-2},
  pages = {115--130},
  publisher = {Springer},
  series = {Lecture Notes in Computer Science},
  title = {Mining Graph Evolution Rules},
  url = {http://dblp.uni-trier.de/db/conf/pkdd/pkdd2009-1.html#BerlingerioBBG09},
  volume = 5781,
  year = 2009
}

@article{johannes2009binary,
  abstract = {Bipartite ranking refers to the problem of learning a ranking function from a training set of positively and negatively labeled examples. Applied to a set of unlabeled instances, a ranking function is expected to establish a total order in which positive instances precede negative ones. The performance of a ranking function is typically measured in terms of the AUC. In this paper, we study the problem of multipartite ranking, an extension of bipartite ranking to the multi-class case. In this regard, we discuss extensions of the AUC metric which are suitable as evaluation criteria for multipartite rankings. Moreover, to learn multipartite ranking functions, we propose methods on the basis of binary decomposition techniques that have previously been used for multi-class and ordinal classification. We compare these methods both analytically and experimentally, not only against each other but also to existing methods applicable to the same problem.},
  author = {Fürnkranz, Johannes and Hüllermeier, Eyke and Vanderlooy, Stijn},
  interhash = {780e00a583e280eebfa4d87cc74e62a1},
  intrahash = {472363e85298f4d6e188d92a4319918a},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {359--374},
  title = {Binary Decomposition Methods for Multipartite Ranking},
  url = {http://dx.doi.org/10.1007/978-3-642-04180-8_41},
  year = 2009
}
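% Note on johannes2009binary: the evaluation side is easy to make concrete.
% A common generalization of the AUC to ordered classes is the C-index, the
% fraction of correctly ordered pairs drawn from different classes. A small
% self-contained sketch follows (the paper also discusses weighted variants
% and the binary decomposition learners themselves, which are not shown):
%
% import numpy as np
%
% def c_index(scores, y):
%     # Fraction of pairs from different classes that the scores order
%     # correctly (ties count 1/2); classes are assumed ordered 0 < 1 < 2 ...
%     scores, y = np.asarray(scores, float), np.asarray(y)
%     num = den = 0.0
%     for i in range(len(y)):
%         for j in range(len(y)):
%             if y[i] < y[j]:
%                 den += 1.0
%                 num += (scores[i] < scores[j]) + 0.5 * (scores[i] == scores[j])
%     return num / den
%
% # Within-class order is irrelevant: only cross-class pairs are scored.
% print(c_index([0.1, 0.4, 0.3, 0.9], [0, 1, 1, 2]))  # -> 1.0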
@article{jorge2009margin,
  abstract = {From a multi-class learning task, in addition to a classifier, it is possible to infer some useful knowledge about the relationship between the classes involved. In this paper we propose a method to learn a hierarchical clustering of the set of classes. The usefulness of such clusterings has been exploited in bio-medical applications to find out relations between diseases or populations of animals. The method proposed here defines a distance between classes based on the margin maximization principle, and then builds the hierarchy using a linkage procedure. Moreover, to quantify the goodness of the hierarchies, we define a measure. Finally, we present a set of experiments comparing the scores achieved by our approach with other methods.},
  author = {Díez, Jorge and del Coz, Juan and Bahamonde, Antonio and Luaces, Oscar},
  interhash = {634c6107fd84fc2f0b17bf1559436a89},
  intrahash = {fa3f2f8d6a9103c72fd8a32ca0d1e247},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {302--314},
  title = {Soft Margin Trees},
  url = {http://dx.doi.org/10.1007/978-3-642-04180-8_37},
  year = 2009
}

@article{kelvin2009multiattribute,
  abstract = {In many real-world applications that analyze correlations between two groups of diverse entities, each group of entities can be characterized by multiple attributes. As such, there is a need to co-cluster multiple attributes’ values into pairs of highly correlated clusters. We denote this co-clustering problem as the multi-attribute co-clustering problem. In this paper, we introduce a generalization of the mutual information between two attributes into mutual information between two attribute sets. The generalized formula enables us to use correlation information to discover multi-attribute co-clusters (MACs). We develop a novel algorithm MACminer to mine MACs with high correlation information from datasets. We demonstrate the mining efficiency of MACminer in datasets with multiple attributes, and show that MACs with high correlation information have higher classification and predictive power, as compared to MACs generated by alternative high-dimensional data clustering and pattern mining techniques.},
  author = {Sim, Kelvin and Gopalkrishnan, Vivekanand and Chua, Hon and Ng, See-Kiong},
  interhash = {5d3a6eff6a13dc171f20316f3a9670ce},
  intrahash = {bbe4260fc922a77c79555f8e9b8120bc},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {398--413},
  title = {MACs: Multi-Attribute Co-clusters with High Correlation Information},
  url = {http://dx.doi.org/10.1007/978-3-642-04174-7_26},
  year = 2009
}

@article{nguyen2009efficient,
  abstract = {Outlier detection finds many applications, especially in domains that have scope for abnormal behavior. In this paper, we present a new technique for detecting distance-based outliers, aimed at reducing execution time associated with the detection process. Our approach operates in two phases and employs three pruning rules. In the first phase, we partition the data into clusters, and make an early estimate on the lower bound of outlier scores. Based on this lower bound, the second phase then processes relevant clusters using the traditional block nested-loop algorithm. Here two efficient pruning rules are utilized to quickly discard more non-outliers and reduce the search space. Detailed analysis of our approach shows that the additional overhead of the first phase is offset by the reduction in cost of the second phase. We also demonstrate the superiority of our approach over existing distance-based outlier detection methods by extensive empirical studies on real datasets.},
  author = {Vu, Nguyen and Gopalkrishnan, Vivekanand},
  interhash = {e219b7e66b466cc39f44520b37f91a61},
  intrahash = {b33d7b9133cc3d81e507f4366658fb56},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {160--175},
  title = {Efficient Pruning Schemes for Distance-Based Outlier Detection},
  url = {http://dx.doi.org/10.1007/978-3-642-04174-7_11},
  year = 2009
}
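% Note on nguyen2009efficient: the abstract builds on the classic nested-loop
% scheme for distance-based outliers. As background, here is the standard
% detector with the basic pruning rule (abandon a candidate once its running
% k-NN distance falls below the weakest current top score). The paper's actual
% contribution, a clustering-based first phase with extra pruning rules, is
% not reproduced; all names below are illustrative.
%
% import heapq
% import numpy as np
%
% def top_outliers(X, k=3, n_out=2):
%     # Outlier score = distance to the k-th nearest neighbor; keep the n_out
%     # highest-scoring points, pruning candidates that cannot qualify.
%     X = np.asarray(X, float)
%     top, cutoff = [], -np.inf  # min-heap of (score, index); prune threshold
%     for i in range(len(X)):
%         knn = []  # negated max-heap of the k smallest distances so far
%         pruned = False
%         for j in np.random.permutation(len(X)):
%             if i == j:
%                 continue
%             d = float(np.linalg.norm(X[i] - X[j]))
%             if len(knn) < k:
%                 heapq.heappush(knn, -d)
%             elif d < -knn[0]:
%                 heapq.heapreplace(knn, -d)
%             if len(knn) == k and -knn[0] <= cutoff:
%                 pruned = True  # the k-NN distance can only shrink further
%                 break
%         if pruned:
%             continue
%         score = -knn[0]
%         if len(top) < n_out:
%             heapq.heappush(top, (score, i))
%         elif score > top[0][0]:
%             heapq.heapreplace(top, (score, i))
%         if len(top) == n_out:
%             cutoff = top[0][0]
%     return sorted(top, reverse=True)
%
% rng = np.random.default_rng(0)
% X = np.vstack([rng.normal(0.0, 1.0, (50, 2)), [[8.0, 8.0], [-9.0, 7.0]]])
% print(top_outliers(X))  # the two planted points (indices 50, 51) surface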
@article{pu2009latent,
  abstract = {Co-clustering has emerged as an important technique for mining contingency data matrices. However, almost all existing co-clustering algorithms are hard partitioning, assigning each row and column of the data matrix to one cluster. Recently a Bayesian co-clustering approach has been proposed which allows probabilistic membership in row and column clusters. The approach uses variational inference for parameter estimation. In this work, we modify the Bayesian co-clustering model, and use collapsed Gibbs sampling and collapsed variational inference for parameter estimation. Our empirical evaluation on real data sets shows that both collapsed Gibbs sampling and collapsed variational inference are able to find more accurate likelihood estimates than the standard variational Bayesian co-clustering approach.},
  author = {Wang, Pu and Domeniconi, Carlotta and Laskey, Kathryn},
  interhash = {ca3c6ea6255fd4fa4601502fd55bec24},
  intrahash = {0ef1833cdcdf2a7d9093e37894c4f3ab},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {522--537},
  title = {Latent Dirichlet Bayesian Co-Clustering},
  url = {http://dx.doi.org/10.1007/978-3-642-04174-7_34},
  year = 2009
}

@article{istván2009latent,
  abstract = {In this paper we introduce and evaluate a technique for applying latent Dirichlet allocation to supervised semantic categorization of documents. In our setup, every category is assigned its own collection of topics, and for a labeled training document only topics from its category are sampled. Thus, compared to classical LDA, which processes the entire corpus in one model, we essentially build separate LDA models for each category with the category-specific topics, and these topic collections are then put together to form a unified LDA model. For an unseen document, the inferred topic distribution gives an estimate of how well the document fits into the category.},
  author = {Bíró, István and Szabó, Jácint},
  interhash = {f4c5b12409be4108320cba5b8fd18c45},
  intrahash = {2db7477d992284eabea47e1c9669ab5a},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {430--441},
  title = {Latent Dirichlet Allocation for Automatic Document Categorization},
  url = {http://dx.doi.org/10.1007/978-3-642-04174-7_28},
  year = 2009
}
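% Note on istván2009latent: the setup (category-specific topic collections,
% with an unseen document scored by how well it fits each category) can be
% approximated with off-the-shelf tools. A toy sketch using scikit-learn, with
% fully separate LDA models standing in for the paper's unified model; the
% corpus, names, and hyperparameters are illustrative.
%
% from sklearn.decomposition import LatentDirichletAllocation
% from sklearn.feature_extraction.text import CountVectorizer
%
% # Two tiny "categories" with distinct vocabularies.
% docs = {"sports": ["ball goal team match", "team win goal score match"],
%         "tech": ["code compiler bug patch", "kernel patch code release bug"]}
% vec = CountVectorizer().fit([d for ds in docs.values() for d in ds])
%
% # One LDA per category, trained only on that category's documents.
% models = {c: LatentDirichletAllocation(n_components=2, random_state=0)
%              .fit(vec.transform(ds)) for c, ds in docs.items()}
%
% def categorize(doc):
%     # Score the unseen document under each category model (sklearn's
%     # score() is an approximate log-likelihood) and take the best fit.
%     x = vec.transform([doc])
%     return max(models, key=lambda c: models[c].score(x))
%
% print(categorize("goal score match"))  # expected: sports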
@article{loulwah2009topic,
  abstract = {Topic models, like Latent Dirichlet Allocation (LDA), have been recently used to automatically generate text corpora topics, and to subdivide the corpus words among those topics. However, not all the estimated topics are of equal importance or correspond to genuine themes of the domain. Some of the topics can be a collection of irrelevant words, or represent insignificant themes. Current approaches to topic modeling perform manual examination to find meaningful topics. This paper presents the first automated unsupervised analysis of LDA models to distinguish junk topics from legitimate ones, and to rank the topic significance. Basically, the distance between a topic distribution and three definitions of “junk distribution” is computed using a variety of measures, from which an expressive figure of the topic significance is implemented using a 4-phase Weighted Combination approach. Our experiments on synthetic and benchmark datasets show the effectiveness of the proposed approach in ranking the topic significance.},
  author = {AlSumait, Loulwah and Barbará, Daniel and Gentle, James and Domeniconi, Carlotta},
  interhash = {273b61715108282ac89350ba18f99eb2},
  intrahash = {6310cb442c4e7852070e4f631fa2c1fa},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {67--82},
  title = {Topic Significance Ranking of LDA Generative Models},
  url = {http://dx.doi.org/10.1007/978-3-642-04180-8_22},
  year = 2009
}

@article{benyah2009identifying,
  abstract = {One major goal of text mining is to provide automatic methods to help humans grasp the key ideas in ever-increasing text corpora. To this end, we propose a statistically well-founded method for identifying the original ideas that a document contributes to a corpus, focusing on self-referential diachronic corpora such as research publications, blogs, email, and news articles. Our statistical model of passage impact defines (interesting) original content through a combination of impact and novelty, and the model is used to identify each document’s most original passages. Unlike heuristic approaches, the statistical model is extensible and open to analysis. We evaluate the approach both on synthetic data and on real data in the domains of research publications and news, showing that the passage impact model outperforms a heuristic baseline method.},
  author = {Shaparenko, Benyah and Joachims, Thorsten},
  interhash = {7db7a26b3d568e8b405a8079b2c067e8},
  intrahash = {c50730e7a14f4a0c76305c44fa2eef2e},
  journal = {Machine Learning and Knowledge Discovery in Databases},
  pages = {350--365},
  title = {Identifying the Original Contribution of a Document via Language Modeling},
  url = {http://dx.doi.org/10.1007/978-3-642-04174-7_23},
  year = 2009
}
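% Note on loulwah2009topic: one ingredient of the approach is concrete enough
% to sketch: measuring how far each topic-word distribution is from a "junk"
% distribution, e.g. the uniform distribution over the vocabulary, and ranking
% topics by that distance. A minimal version using KL divergence (the paper
% combines three junk definitions and several distance measures in a 4-phase
% weighted scheme, which is not reproduced here):
%
% import numpy as np
%
% def rank_topic_significance(topic_word):
%     # KL divergence from the uniform "junk" distribution over the
%     # vocabulary; peaked, thematic topics score high, flat ones low.
%     tw = np.asarray(topic_word, float)
%     tw = tw / tw.sum(axis=1, keepdims=True)
%     uniform = 1.0 / tw.shape[1]
%     kl = (tw * np.log((tw + 1e-12) / uniform)).sum(axis=1)
%     return np.argsort(-kl)  # most significant topic first
%
% topics = [[0.85, 0.05, 0.05, 0.05],   # peaked -> likely meaningful
%           [0.26, 0.24, 0.25, 0.25]]   # near-uniform -> junk-like
% print(rank_topic_significance(topics))  # -> [0 1]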