Publications
2010
Marc Maier, Brian Taylor, Huseyin Oktay, David Jensen
Learning causal models of relational domains Proceedings Article
In: Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2010, Atlanta, Georgia, USA, July 11-15, 2010.
@inproceedings{maier2010learning,
title = {Learning causal models of relational domains},
author = {Marc Maier and Brian Taylor and Huseyin Oktay and David Jensen},
url = {http://www.aaai.org/ocs/index.php/AAAI/AAAI10/paper/view/1919},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence,
AAAI 2010, Atlanta, Georgia, USA, July 11-15, 2010},
volume = {24},
number = {1},
abstract = {Methods for discovering causal knowledge from observational data have been a persistent topic of AI research for several decades. Essentially all of this work focuses on knowledge representations for propositional domains. In this paper, we present several key algorithmic and theoretical innovations that extend causal discovery to relational domains. We provide strong evidence that effective learning of causal models is enhanced by relational representations. We present an algorithm, relational PC, that learns causal dependencies in a state-of-the-art relational representation, and we identify the key representational and algorithmic innovations that make the algorithm possible. Finally, we prove the algorithm's theoretical correctness and demonstrate its effectiveness on synthetic and real data sets.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Huseyin Oktay, Brian Taylor, David Jensen
Causal discovery in social media using quasi-experimental designs Proceedings Article
In: Proceedings of the 3rd Workshop on Social Network Mining and Analysis, SNAKDD, pp. 1–9, 2010.
@inproceedings{oktay2010causal,
title = {Causal discovery in social media using quasi-experimental designs},
author = {Huseyin Oktay and Brian Taylor and David Jensen},
url = {https://doi.org/10.1145/1964858.1964859},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the 3rd Workshop on Social Network Mining and Analysis,
SNAKDD},
pages = {1--9},
abstract = {Social media systems have become increasingly attractive to both users and companies providing those systems. Efficient management of these systems is essential and requires knowledge of cause-and-effect relationships within the system. Online experimentation can be used to discover causal knowledge; however, this ignores the observational data that is already being collected for operational purposes. Quasi-experimental designs (QEDs) are commonly used in social sciences to discover causal knowledge from observational data, and QEDs can be exploited to discover causal knowledge about social media systems. In this paper, we apply three different QEDs to demonstrate how one can gain a causal understanding of a social media system. The conclusions drawn from using a QED can have threats to their validity, but we show how one can carefully construct sophisticated designs to overcome some of those threats.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Brian Delaney, Andrew Fast, W. Campbell, C. Weinstein, David Jensen
The application of statistical relational learning to a database of criminal and terrorist activity Proceedings Article
In: Proceedings of the 2010 SIAM International Conference on Data Mining, pp. 409–417, Society for Industrial and Applied Mathematics, 2010.
@inproceedings{delaney2010application,
title = {The application of statistical relational learning to a database of criminal and terrorist activity},
author = {Brian Delaney and Andrew Fast and W. Campbell and C. Weinstein and David Jensen},
url = {https://doi.org/10.1137/1.9781611972801.36},
year = {2010},
date = {2010-01-01},
booktitle = {Proceedings of the 2010 SIAM International Conference on Data Mining},
pages = {409--417},
organization = {Society for Industrial and Applied Mathematics},
abstract = {We apply statistical relational learning to a database of criminal and terrorist activity to predict attributes and event outcomes. The database stems from a collection of news articles and court records which are carefully annotated with a variety of variables, including categorical and continuous fields. Manual analysis of this data can help inform decision makers seeking to curb violent activity within a region. We use this data to build relational models from historical data to predict attributes of groups, individuals, or events. Our first example involves predicting social network roles within a group under a variety of different data conditions. Collective classification can be used to boost the accuracy under data poor conditions. Additionally, we were able to predict the outcome of hostage negotiations using models trained on previous kidnapping events. The overall framework and techniques described here are flexible enough to be used to predict a variety of variables. Such predictions could be used as input to a more complex system to recognize intent of terrorist groups or as input to inform human decision makers.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matthew Rattigan, David Jensen
Leveraging d-separation for relational data sets Proceedings Article
In: ICDM 2010, The 10th IEEE International Conference on Data Mining, pp. 989–994, IEEE, 2010.
@inproceedings{rattigan2010leveraging,
title = {Leveraging d-separation for relational data sets},
author = {Matthew Rattigan and David Jensen},
url = {https://doi.org/10.1109/ICDM.2010.142},
year = {2010},
date = {2010-01-01},
booktitle = {ICDM 2010, The 10th IEEE International Conference on Data Mining},
pages = {989--994},
organization = {IEEE},
abstract = {Testing for marginal and conditional independence is a common task in machine learning and knowledge discovery applications. Prior work has demonstrated that conventional independence tests suffer from dramatically increased rates of Type I errors when naively applied to relational data. We use graphical models to specify the conditions under which these errors occur, and use those models to devise novel and accurate conditional independence tests.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2009
Michael Hay, Chao Li, Gerome Miklau, David Jensen
Accurate Estimation of the Degree Distribution of Private Networks Proceedings Article
In: ICDM 2009, The Ninth IEEE International Conference on Data Mining, Miami, Florida, USA, 6-9 December 2009, pp. 169–178, IEEE Computer Society, 2009.
@inproceedings{hay2009accurate,
title = {Accurate Estimation of the Degree Distribution of Private Networks},
author = {Michael Hay and Chao Li and Gerome Miklau and David Jensen},
url = {https://doi.org/10.1109/ICDM.2009.11},
year = {2009},
date = {2009-01-01},
booktitle = {ICDM 2009, The Ninth IEEE International Conference on Data Mining,
Miami, Florida, USA, 6-9 December 2009},
pages = {169--178},
publisher = {IEEE Computer Society},
abstract = {We describe an efficient algorithm for releasing a provably private estimate of the degree distribution of a network. The algorithm satisfies a rigorous property of differential privacy, and is also extremely efficient, running on networks of 100 million nodes in a few seconds. Theoretical analysis shows that the error scales linearly with the number of unique degrees, whereas the error of conventional techniques scales linearly with the number of nodes. We complement the theoretical analysis with a thorough empirical analysis on real and synthetic graphs, showing that the algorithm’s variance and bias is low, that the error diminishes as the size of the input graph increases, and that common analyses like fitting a power-law can be carried out very accurately.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
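A minimal sketch, assuming NetworkX and NumPy, of the Laplace-mechanism intuition behind differentially private degree statistics. It is illustration only: the paper's estimator additionally applies constrained inference to the noisy degree sequence, and the sensitivity bound, epsilon value, and graph generator below are assumptions, not details taken from the paper.

# Illustrative only: add Laplace noise to a degree histogram for
# epsilon-differential privacy under edge-level neighboring graphs.
# This is NOT the constrained-inference estimator described in the paper.
import numpy as np
import networkx as nx

def noisy_degree_histogram(graph: nx.Graph, epsilon: float) -> np.ndarray:
    """Return a Laplace-perturbed histogram of node degrees."""
    degrees = [d for _, d in graph.degree()]
    hist = np.bincount(degrees, minlength=graph.number_of_nodes()).astype(float)
    # Adding or removing one edge shifts two nodes to adjacent degree bins,
    # changing at most four histogram cells by 1, so L1 sensitivity <= 4.
    sensitivity = 4.0
    noise = np.random.laplace(scale=sensitivity / epsilon, size=hist.shape)
    return hist + noise

if __name__ == "__main__":
    g = nx.barabasi_albert_graph(10_000, 3)  # hypothetical test graph
    print(noisy_degree_histogram(g, epsilon=1.0)[:10])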
Andrew Fast, David Jensen
Constraint relaxation for learning the structure of Bayesian networks Technical Report
Tech Report 09-18, University of Massachusetts Amherst, Computer Science~… 2009.
@techreport{fast2009constraint,
title = {Constraint relaxation for learning the structure of Bayesian networks},
author = {Andrew Fast and David Jensen},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.217.7971&rep=rep1&type=pdf},
year = {2009},
date = {2009-01-01},
institution = {Tech Report 09-18, University of Massachusetts Amherst, Computer Science~…},
abstract = {This paper introduces constraint relaxation, a new strategy for learning the structure of Bayesian networks. Constraint relaxation identifies and “relaxes” possibly inaccurate independence constraints on the structure of the model. We describe a heuristic algorithm for constraint relaxation that combines greedy search in the space of undirected skeletons with edge orientation based on the constraints. This approach produces significant improvements in the structural accuracy of the learned models compared to four well-known structure learning algorithms in an empirical evaluation using data sampled from both real-world and randomly generated networks.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
2008
Michael Hay, Gerome Miklau, David Jensen, Don Towsley, Philipp Weis
Resisting structural re-identification in anonymized social networks Journal Article
In: Proceedings of the VLDB Endowment, vol. 1, no. 1, pp. 102–114, 2008.
@article{hay2008resisting,
title = {Resisting structural re-identification in anonymized social networks},
author = {Michael Hay and Gerome Miklau and David Jensen and Don Towsley and Philipp Weis},
url = {https://dl.acm.org/doi/pdf/10.14778/1453856.1453873},
year = {2008},
date = {2008-01-01},
journal = {Proceedings of the VLDB Endowment},
volume = {1},
number = {1},
pages = {102--114},
publisher = {VLDB Endowment},
abstract = {We identify privacy risks associated with releasing network data sets and provide an algorithm that mitigates those risks. A network consists of entities connected by links representing relations such as friendship, communication, or shared activity. Maintaining privacy when publishing networked data is uniquely challenging because an individual's network context can be used to identify them even if other identifying information is removed. In this paper, we quantify the privacy risks associated with three classes of attacks on the privacy of individuals in networks, based on the knowledge used by the adversary. We show that the risks of these attacks vary greatly based on network structure and size. We propose a novel approach to anonymizing network data that models aggregate network structure and then allows samples to be drawn from that model. The approach guarantees anonymity for network entities while preserving the ability to estimate a wide variety of network measures with relatively little bias.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Özgür Şimşek, David Jensen
Navigating networks by using homophily and degree Journal Article
In: Proceedings of the National Academy of Sciences, vol. 105, no. 35, pp. 12758–12762, 2008.
@article{csimcsek2008navigating,
title = {Navigating networks by using homophily and degree},
author = {Özgür Şimşek and David Jensen},
url = {https://www.pnas.org/content/pnas/105/35/12758.full.pdf},
year = {2008},
date = {2008-01-01},
journal = {Proceedings of the National Academy of Sciences},
volume = {105},
number = {35},
pages = {12758--12762},
publisher = {National Academy of Sciences},
abstract = {Many large distributed systems can be characterized as networks where short paths exist between nearly every pair of nodes. These include social, biological, communication, and distribution networks, which often display power-law or small-world structure. A central challenge of distributed systems is directing messages to specific nodes through a sequence of decisions made by individual nodes without global knowledge of the network. We present a probabilistic analysis of this navigation problem that produces a surprisingly simple and effective method for directing messages. This method requires calculating only the product of the two measures widely used to summarize all local information. It outperforms prior approaches reported in the literature by a large margin, and it provides a formal model that may describe how humans make decisions in sociological studies intended to explore the social network as well as how they make decisions in more naturalistic settings.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
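A minimal sketch, in Python with NetworkX, of the greedy forwarding rule the abstract describes: pass the message to the neighbor whose product of degree and similarity (homophily) to the target is largest. The "group" attribute and the toy similarity function are illustrative assumptions, not details from the paper.

# Sketch of degree-times-homophily greedy navigation (assumptions noted above).
import networkx as nx

def similarity(g: nx.Graph, node, target) -> float:
    """Toy homophily score: higher when node attributes match the target's."""
    return 1.0 if g.nodes[node].get("group") == g.nodes[target].get("group") else 0.1

def navigate(g: nx.Graph, source, target, max_steps: int = 100):
    """Greedily forward toward `target`, never revisiting a node; return the path."""
    path, current = [source], source
    for _ in range(max_steps):
        if current == target:
            break
        candidates = [n for n in g.neighbors(current) if n not in path]
        if not candidates:
            break  # dead end: the message is simply dropped in this sketch
        current = max(candidates, key=lambda n: g.degree(n) * similarity(g, n, target))
        path.append(current)
    return path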
Jennifer Neville, David Jensen
A bias/variance decomposition for models using collective inference Journal Article
In: Machine Learning, vol. 73, no. 1, pp. 87–106, 2008.
@article{neville2008bias,
title = {A bias/variance decomposition for models using collective inference},
author = {Jennifer Neville and David Jensen},
url = {https://doi.org/10.1007/s10994-008-5066-6},
year = {2008},
date = {2008-01-01},
journal = {Machine Learning},
volume = {73},
number = {1},
pages = {87--106},
publisher = {Springer US},
abstract = {Bias/variance analysis is a useful tool for investigating the performance of machine learning algorithms. Conventional analysis decomposes loss into errors due to aspects of the learning process, but in relational domains, the inference process used for prediction introduces an additional source of error. Collective inference techniques introduce additional error, both through the use of approximate inference algorithms and through variation in the availability of test-set information. To date, the impact of inference error on model performance has not been investigated. We propose a new bias/variance framework that decomposes loss into errors due to both the learning and inference processes. We evaluate the performance of three relational models on both synthetic and real-world datasets and show that (1) inference can be a significant source of error, and (2) the models exhibit different types of errors as data characteristics are varied.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Amy McGovern, David Jensen
Optimistic pruning for multiple instance learning Journal Article
In: Pattern Recognition Letters, vol. 29, no. 9, pp. 1252–1260, 2008.
@article{mcgovern2008optimistic,
title = {Optimistic pruning for multiple instance learning},
author = {Amy McGovern and David Jensen},
url = {https://doi.org/10.1016/j.patrec.2008.01.024},
year = {2008},
date = {2008-01-01},
journal = {Pattern Recognition Letters},
volume = {29},
number = {9},
pages = {1252--1260},
publisher = {North-Holland},
abstract = {This paper introduces a simple evaluation function for multiple instance learning that admits an optimistic pruning strategy. We demonstrate comparable results to state-of-the-art methods using significantly fewer computational resources.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Andrew Fast, David Jensen
Why stacked models perform effective collective classification Proceedings Article
In: Proceedings of the 8th IEEE International Conference on Data Mining (ICDM 2008), December 15-19, 2008, Pisa, Italy, pp. 785–790, IEEE Computer Society, 2008.
@inproceedings{fast2008stacked,
title = {Why stacked models perform effective collective classification},
author = {Andrew Fast and David Jensen},
url = {https://doi.org/10.1109/ICDM.2008.126},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 8th IEEE International Conference on Data Mining
(ICDM 2008), December 15-19, 2008, Pisa, Italy},
pages = {785--790},
publisher = {IEEE Computer Society},
abstract = {Collective classification techniques jointly infer all class labels of a relational data set, using the inferences about one class label to influence inferences about related class labels. Kou and Cohen recently introduced an efficient relational model based on stacking that, despite its simplicity, has equivalent accuracy to more sophisticated joint inference approaches. Using experiments on both real and synthetic data, we show that the primary cause for the performance of the stacked model is the reduction in bias from learning the stacked model on inferred labels rather than true labels. The reduction in variance due to conditional inference also contributes to the effect but it is not as strong. In addition, we show that the performance of the joint inference and stacked learners can be attributed to an implicit weighting of local and relational features at learning time.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
David Jensen, Andrew Fast, Brian Taylor, Marc Maier
Automatic Identification of Quasi-Experimental Designs for Discovering Causal Knowledge Proceedings Article
In: Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 372–380, Association for Computing Machinery, Las Vegas, Nevada, USA, 2008, ISBN: 9781605581934.
@inproceedings{10.1145/1401890.1401938,
title = {Automatic Identification of Quasi-Experimental Designs for Discovering Causal Knowledge},
author = {David Jensen and Andrew Fast and Brian Taylor and Marc Maier},
url = {https://doi.org/10.1145/1401890.1401938},
doi = {10.1145/1401890.1401938},
isbn = {9781605581934},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {372–380},
publisher = {Association for Computing Machinery},
address = {Las Vegas, Nevada, USA},
series = {KDD '08},
abstract = {Researchers in the social and behavioral sciences routinely rely on quasi-experimental designs to discover knowledge from large data-bases. Quasi-experimental designs (QEDs) exploit fortuitous circumstances in non-experimental data to identify situations (sometimes called "natural experiments") that provide the equivalent of experimental control and randomization. QEDs allow researchers in domains as diverse as sociology, medicine, and marketing to draw reliable inferences about causal dependencies from non-experimental data. Unfortunately, identifying and exploiting QEDs has remained a painstaking manual activity, requiring researchers to scour available databases and apply substantial knowledge of statistics. However, recent advances in the expressiveness of databases, and increases in their size and complexity, provide the necessary conditions to automatically identify QEDs. In this paper, we describe the first system to discover knowledge by applying quasi-experimental designs that were identified automatically. We demonstrate that QEDs can be identified in a traditional database schema and that such identification requires only a small number of extensions to that schema, knowledge about quasi-experimental design encoded in first-order logic, and a theorem-proving engine. We describe several key innovations necessary to enable this system, including methods for automatically constructing appropriate experimental units and for creating aggregate variables on those units. We show that applying the resulting designs can identify important causal dependencies in real domains, and we provide examples from academic publishing, movie making and marketing, and peer-production systems. Finally, we discuss the integration of QEDs with other approaches to causal discovery, including joint modeling and directed experimentation.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrew Fast, Michael Hay, David Jensen
Improving accuracy of constraint-based structure learning Technical Report
Technical report 08-48, University of Massachusetts Amherst, Computer~… 2008.
@techreport{fast2008improving,
title = {Improving accuracy of constraint-based structure learning},
author = {Andrew Fast and Michael Hay and David Jensen},
url = {https://www.researchgate.net/profile/David-Jensen-10/publication/228854891_Improving_Accuracy_of_Constraint-Based_Structure_Learning/links/09e41510892d741c18000000/Improving-Accuracy-of-Constraint-Based-Structure-Learning.pdf},
year = {2008},
date = {2008-01-01},
institution = {Technical report 08-48, University of Massachusetts Amherst, Computer~…},
abstract = {Hybrid algorithms for learning the structure of Bayesian networks combine techniques from both the constraint-based and search-and-score paradigms of structure learning. One class of hybrid approaches uses a constraint-based algorithm to learn an undirected skeleton identifying edges that should appear in the final network. This skeleton is used to constrain the model space considered by a search-and-score algorithm to orient the edges and produce a final model structure. At small sample sizes, models learned using this hybrid approach do not achieve likelihood as high as models learned by unconstrained search. Low performance is a result of errors made by the skeleton identification algorithm, particularly false negative errors, which lead to an over-constrained search space. These errors are often attributed to “noisy” hypothesis tests that are run during skeleton identification. However, at least three specific sources of error have been identified in the literature: unsuitable hypothesis tests, low-power hypothesis tests, and unexplained d-separation. No previous work has considered these sources of error in combination. We determine the relative importance of each source individually and in combination. We identify that low-power tests are the primary source of false negative errors, and show that these errors can be corrected by a novel application of statistical power analysis. The result is a new hybrid algorithm for learning the structure of Bayesian networks which produces models with equivalent likelihood to models produced by unconstrained greedy search, using only a fraction of the time.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
2007
Jennifer Neville, David Jensen
Relational Dependency Networks Journal Article
In: Journal of Machine Learning Research, vol. 8, pp. 653–692, 2007.
@article{DBLP:journals/jmlr/NevilleJ07,
title = {Relational Dependency Networks},
author = {Jennifer Neville and David Jensen},
url = {http://dl.acm.org/citation.cfm?id=1314522},
year = {2007},
date = {2007-01-01},
journal = {Journal of Machine Learning Research},
volume = {8},
pages = {653--692},
abstract = {Recent work on graphical models for relational data has demonstrated significant improvements in classification and inference when models represent the dependencies among instances. Despite its use in conventional statistical models, the assumption of instance independence is contradicted by most relational data sets. For example, in citation data there are dependencies among the topics of a paper’s references, and in genomic data there are dependencies among the functions of interacting proteins. In this paper, we present relational dependency networks (RDNs), graphical models that are capable of expressing and reasoning with such dependencies in a relational setting. We discuss RDNs in the context of relational Bayes networks and relational Markov networks and outline the relative strengths of RDNs—namely, the ability to represent cyclic dependencies, simple methods for parameter estimation, and efficient structure learning techniques. The strengths of RDNs are due to the use of pseudolikelihood learning techniques, which estimate an efficient approximation of the full joint distribution. We present learned RDNs for a number of real-world data sets and evaluate the models in a prediction context, showing that RDNs identify and exploit cyclic relational dependencies to achieve significant performance gains over conventional conditional models. In addition, we use synthetic data to explore model performance under various relational data characteristics, showing that RDN learning and inference techniques are accurate over a wide range of conditions.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael Hay, Gerome Miklau, David Jensen, Philipp Weis, Siddharth Srivastava
Anonymizing social networks Journal Article
In: Computer Science Department Faculty Publication Series, pp. 180, 2007.
@article{hay2007anonymizing,
title = {Anonymizing social networks},
author = {Michael Hay and Gerome Miklau and David Jensen and Philipp Weis and Siddharth Srivastava},
url = {https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1175&context=cs_faculty_pubs},
year = {2007},
date = {2007-01-01},
journal = {Computer Science Department Faculty Publication Series},
pages = {180},
abstract = {Advances in technology have made it possible to collect data about individuals and the connections between them, such as email correspondence and friendships. Agencies and researchers who have collected such social network data often have a compelling interest in allowing others to analyze the data. However, in many cases the data describes relationships that are private (e.g., email correspondence) and sharing the data in full can result in unacceptable disclosures. In this paper, we present a framework for assessing the privacy risk of sharing anonymized network data. This includes a model of adversary knowledge, for which we consider several variants and make connections to known graph theoretical results. On several real-world social networks, we show that simple anonymization techniques are inadequate, resulting in substantial breaches of privacy for even modestly informed adversaries. We propose a novel anonymization technique based on perturbing the network and demonstrate empirically that it leads to substantial reduction of the privacy threat. We also analyze the effect that anonymizing the network has on the utility of the data for social network analysis.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael Hay, Andrew Fast, David Jensen
Understanding the effects of search constraints on structure learning Journal Article
In: University of Massachusetts Amherst, Computer Science Technical Report 07-21, 2007.
@article{hay2007understanding,
title = {Understanding the effects of search constraints on structure learning},
author = {Michael Hay and Andrew Fast and David Jensen},
url = {https://kdl.cs.umass.edu/papers/hay-et-al-tr0721.pdf},
year = {2007},
date = {2007-01-01},
journal = {University of Massachusetts Amherst, Computer Science Technical Report},
number = {07-21},
abstract = {Recently, Tsamardinos et al. [2006] presented an algorithm for Bayesian network structure learning that outperforms many state-of-the-art algorithms in terms of efficiency, structure similarity and likelihood. The Max-Min Hill Climbing algorithm is a hybrid of constraint-based and search-and-score techniques, using greedy hill climbing to search a constrained space of possible network structures. The constraints correspond to assertions of conditional independence that must hold in the network from which the data were sampled. One would expect that constraining the space would make search both faster and more accurate, focusing search on the “right” part of the space. The published results indicate, however, that the resulting structures are less accurate when search is constrained. We reproduce these results and explain why they occur. At small samples, the statistical test of conditional independence has low power, which causes the algorithm to exclude edges between dependent variables. Also, the constraints make search relatively harder, leading to errors in edge orientation. In an unconstrained space, search can “repair” these errors by adding in extra edges. We conclude by proposing and evaluating an improved algorithm.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Matthew Rattigan, Marc Maier, David Jensen
Graph clustering with network structure indices Proceedings Article
In: Machine Learning, Proceedings of the Twenty-Fourth International Conference (ICML 2007), Corvallis, Oregon, USA, June 20-24, 2007, pp. 783–790, ACM, 2007.
@inproceedings{DBLP:conf/icml/RattiganMJ07,
title = {Graph clustering with network structure indices},
author = {Matthew Rattigan and Marc Maier and David Jensen},
url = {https://doi.org/10.1145/1273496.1273595},
doi = {10.1145/1273496.1273595},
year = {2007},
date = {2007-01-01},
booktitle = {Machine Learning, Proceedings of the Twenty-Fourth International Conference
(ICML 2007), Corvallis, Oregon, USA, June 20-24, 2007},
volume = {227},
pages = {783--790},
publisher = {ACM},
series = {ACM International Conference Proceeding Series},
abstract = {Graph clustering has become ubiquitous in the study of relational data sets. We examine two simple algorithms: a new graphical adaptation of the k-medoids algorithm and the Girvan-Newman method based on edge betweenness centrality. We show that they can be effective at discovering the latent groups or communities that are defined by the link structure of a graph. However, both approaches rely on prohibitively expensive computations, given the size of modern relational data sets. Network structure indices (NSIs) are a proven technique for indexing network structure and efficiently finding short paths. We show how incorporating NSIs into these graph clustering algorithms can overcome these complexity limitations. We also present promising quantitative and qualitative evaluations of the modified algorithms on synthetic and real data sets.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Trevor Strohman, W. Bruce Croft, David Jensen
Recommending citations for academic papers Proceedings Article
In: SIGIR 2007: Proceedings of the 30th Annual International ACM SIGIR Conference on Research and Development in Information Retrieval, Amsterdam, The Netherlands, July 23-27, 2007, pp. 705–706, ACM, 2007.
@inproceedings{DBLP:conf/sigir/StrohmanCJ07,
title = {Recommending citations for academic papers},
author = {Trevor Strohman and W. Bruce Croft and David Jensen},
url = {https://doi.org/10.1145/1277741.1277868},
doi = {10.1145/1277741.1277868},
year = {2007},
date = {2007-01-01},
booktitle = {SIGIR 2007: Proceedings of the 30th Annual International ACM SIGIR
Conference on Research and Development in Information Retrieval, Amsterdam,
The Netherlands, July 23-27, 2007},
pages = {705--706},
publisher = {ACM},
abstract = {We approach the problem of academic literature search by considering an unpublished manuscript as a query to a search system. We use the text of previous literature as well as the citation graph that connects it to find relevant related material. We evaluate our technique with manual and automatic evaluation methods, and find an order of magnitude improvement in mean average precision as compared to a text similarity baseline.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Lisa Friedland, David Jensen
Finding tribes: identifying close-knit individuals from employment patterns Proceedings Article
In: Proceedings of the 13th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, San Jose, California, USA, August 12-15, 2007, pp. 290–299, ACM, 2007.
@inproceedings{DBLP:conf/kdd/FriedlandJ07,
title = {Finding tribes: identifying close-knit individuals from employment
patterns},
author = {Lisa Friedland and David Jensen},
url = {https://doi.org/10.1145/1281192.1281226},
doi = {10.1145/1281192.1281226},
year = {2007},
date = {2007-01-01},
booktitle = {Proceedings of the 13th ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining, San Jose, California, USA, August
12-15, 2007},
pages = {290--299},
publisher = {ACM},
abstract = {We present a family of algorithms to uncover tribes: groups of individuals who share unusual sequences of affiliations. While much work inferring community structure describes large-scale trends, we instead search for small groups of tightly linked individuals who behave anomalously with respect to those trends. We apply the algorithms to a large temporal and relational data set consisting of millions of employment records from the National Association of Securities Dealers. The resulting tribes contain individuals at higher risk for fraud, are homogenous with respect to risk scores, and are geographically mobile, all at significant levels compared to random or to other sets of individuals who share affiliations.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matthew Rattigan, Marc Maier, David Jensen, Bin Wu, Xin Pei, Jianbin Tan, Yi Wang
Exploiting Network Structure for Active Inference in Collective Classification Proceedings Article
In: Workshops Proceedings of the 7th IEEE International Conference on Data Mining (ICDM 2007), October 28-31, 2007, Omaha, Nebraska, USA, pp. 429–434, IEEE Computer Society, 2007.
@inproceedings{DBLP:conf/icdm/RattiganMJWPTW07,
title = {Exploiting Network Structure for Active Inference in Collective Classification},
author = {Matthew Rattigan and Marc Maier and David Jensen and Bin Wu and Xin Pei and Jianbin Tan and Yi Wang},
url = {https://doi.org/10.1109/ICDMW.2007.124},
doi = {10.1109/ICDMW.2007.124},
year = {2007},
date = {2007-01-01},
booktitle = {Workshops Proceedings of the 7th IEEE International Conference on
Data Mining (ICDM 2007), October 28-31, 2007, Omaha, Nebraska, USA},
pages = {429--434},
publisher = {IEEE Computer Society},
abstract = {Active inference seeks to maximize classification performance while minimizing the amount of data that must be labeled ex ante. This task is particularly relevant in the context of relational data, where statistical dependencies among instances can be exploited to improve classification accuracy. We show that efficient methods for indexing network structure can be exploited to select high-value nodes for labeling. This approach substantially outperforms random selection and selection based on simple measures of local structure. We demonstrate the relative effectiveness of this selection approach through experiments with a relational neighbor classifier on a variety of real and synthetic data sets, and identify the necessary characteristics of the data set that allow this approach to perform well.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrew Fast, Lisa Friedland, Marc Maier, Brian Taylor, David Jensen, Henry G. Goldberg, John Komoroske
Relational data pre-processing techniques for improved securities fraud detection Proceedings Article
In: Proceedings of the 13th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, San Jose, California, USA, August 12-15, 2007, pp. 941–949, ACM, 2007.
@inproceedings{DBLP:conf/kdd/FastFMTJGK07,
title = {Relational data pre-processing techniques for improved securities
fraud detection},
author = {Andrew Fast and Lisa Friedland and Marc Maier and Brian Taylor and David Jensen and Henry G. Goldberg and John Komoroske},
url = {https://doi.org/10.1145/1281192.1281293},
doi = {10.1145/1281192.1281293},
year = {2007},
date = {2007-01-01},
booktitle = {Proceedings of the 13th ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining, San Jose, California, USA, August
12-15, 2007},
pages = {941--949},
publisher = {ACM},
abstract = {Commercial datasets are often large, relational, and dynamic. They contain many records of people, places, things, events and their interactions over time. Such datasets are rarely structured appropriately for knowledge discovery, and they often contain variables whose meanings change across different subsets of the data. We describe how these challenges were addressed in a collaborative analysis project undertaken by the University of Massachusetts Amherst and the National Association of Securities Dealers (NASD). We describe several methods for data pre-processing that we applied to transform a large, dynamic, and relational dataset describing nearly the entirety of the U.S. securities industry, and we show how these methods made the dataset suitable for learning statistical relational models. To better utilize social structure, we first applied known consolidation and link formation techniques to associate individuals with branch office locations. In addition, we developed an innovative technique to infer professional associations by exploiting dynamic employment histories. Finally, we applied normalization techniques to create a suitable class label that adjusts for spatial, temporal, and other heterogeneity within the data. We show how these pre-processing techniques combine to provide the necessary foundation for learning high-performing statistical models of fraudulent activity.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
David Jensen
Beyond Prediction: Directions for Probabilistic and Relational Learning Proceedings Article
In: Inductive Logic Programming, 17th International Conference, ILP 2007, Corvallis, OR, USA, June 19-21, 2007, Revised Selected Papers, pp. 4–21, Springer, 2007.
@inproceedings{DBLP:conf/ilp/Jensen07,
title = {Beyond Prediction: Directions for Probabilistic and Relational Learning},
author = {David Jensen},
url = {https://doi.org/10.1007/978-3-540-78469-2_2},
doi = {10.1007/978-3-540-78469-2_2},
year = {2007},
date = {2007-01-01},
booktitle = {Inductive Logic Programming, 17th International Conference, ILP
2007, Corvallis, OR, USA, June 19-21, 2007, Revised Selected Papers},
volume = {4894},
pages = {4--21},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
abstract = {Research over the past several decades in learning logical and probabilistic models has greatly increased the range of phenomena that machine learning can address. Recent work has extended these boundaries even further by unifying these two powerful learning frameworks. However, new frontiers await. Current techniques are capable of learning only a subset of the knowledge needed by practitioners in important domains, and further unification of probabilistic and logical learning offers a unique ability to produce the full range of knowledge needed in a wide range of applications.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, David Jensen
Bias/Variance Analysis for Relational Domains Proceedings Article
In: Inductive Logic Programming, 17th International Conference, ILP 2007, Corvallis, OR, USA, June 19-21, 2007, Revised Selected Papers, pp. 27–28, Springer, 2007.
@inproceedings{DBLP:conf/ilp/NevilleJ07,
title = {Bias/Variance Analysis for Relational Domains},
author = {Jennifer Neville and David Jensen},
url = {https://doi.org/10.1007/978-3-540-78469-2_6},
doi = {10.1007/978-3-540-78469-2_6},
year = {2007},
date = {2007-01-01},
booktitle = {Inductive Logic Programming, 17th International Conference, ILP
2007, Corvallis, OR, USA, June 19-21, 2007, Revised Selected Papers},
volume = {4894},
pages = {27--28},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
abstract = {Bias/variance analysis is a useful tool for investigating the performance of machine learning algorithms. Conventional analysis decomposes loss into errors due to aspects of the learning process with an underlying assumption that there is no variation in model predictions due to the inference process used for prediction. This assumption is often violated when collective inference models are used for classification of relational data. In relational data, when there are dependencies among the class labels of related instances, the inferences about one object can be used to improve the inferences about other related objects. Collective inference techniques exploit these dependencies by jointly inferring the class labels in a test set. This approach can produce more accurate predictions than conditional inference for each instance independently, but it also introduces an additional source of error, both through the use of approximate inference algorithms and through variation in the availability of test set information. To date, the impact of inference error on relational model performance has not been investigated.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2006
Hendrik Blockeel, David Jensen, Stefan Kramer
Introduction to the special issue on multi-relational data mining and statistical relational learning Journal Article
In: Machine Learning, vol. 62, no. 1-2, pp. 3–5, 2006.
@article{DBLP:journals/ml/BlockeelJK06,
title = {Introduction to the special issue on multi-relational data mining
and statistical relational learning},
author = {Hendrik Blockeel and David Jensen and Stefan Kramer},
url = {https://doi.org/10.1007/s10994-006-5856-7},
doi = {10.1007/s10994-006-5856-7},
year = {2006},
date = {2006-01-01},
journal = {Machine Learning},
volume = {62},
number = {1-2},
pages = {3--5},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Aaron M. Ellison, Leon J. Osterweil, Lori Clarke, Julian L. Hadley, Alexander Wise, Emery Boose, David R. Foster, Allen Hanson, David Jensen, Paul Kuzeja, et al.
Analytic webs support the synthesis of ecological data sets Journal Article
In: Ecology, vol. 87, no. 6, pp. 1345–1358, 2006.
@article{ellison2006analytic,
title = {Analytic webs support the synthesis of ecological data sets},
author = {Aaron M. Ellison and Leon J. Osterweil and Lori Clarke and Julian L. Hadley and Alexander Wise and Emery Boose and David R. Foster and Allen Hanson and David Jensen and Paul Kuzeja and others},
url = {https://esajournals.onlinelibrary.wiley.com/doi/pdfdirect/10.1890/0012-9658%282006%2987%5B1345%3AAWSTSO%5D2.0.CO%3B2},
year = {2006},
date = {2006-01-01},
journal = {Ecology},
volume = {87},
number = {6},
pages = {1345--1358},
publisher = {Wiley Online Library},
abstract = {A wide variety of data sets produced by individual investigators are now synthesized to address ecological questions that span a range of spatial and temporal scales. It is important to facilitate such syntheses so that "consumers" of data sets can be confident that both input data sets and synthetic products are reliable. Necessary documentation to ensure the reliability and validation of data sets includes both familiar descriptive metadata and formal documentation of the scientific processes used (i.e., process metadata) to produce usable data sets from collections of raw data. Such documentation is complex and difficult to construct, so it is important to help "producers" create reliable data sets and to facilitate their creation of required metadata. We describe a formal representation, an "analytic web," that aids both producers and consumers of data sets by providing complete and precise definitions of scientific processes used to process raw and derived data sets. The formalisms used to define analytic webs are adaptations of those used in software engineering, and they provide a novel and effective support system for both the synthesis and the validation of ecological data sets. We illustrate the utility of an analytic web as an aid to producing synthetic data sets through a worked example: the synthesis of long-term measurements of whole-ecosystem carbon exchange. Analytic webs are also useful validation aids for consumers because they support the concurrent construction of a complete, Internet-accessible audit trail of the analytic processes used in the synthesis of the data sets. Finally we describe our early efforts to evaluate these ideas through the use of a prototype software tool, SciWalker. We indicate how this tool has been used to create analytic webs tailored to specific data-set synthesis and validation activities, and suggest extensions to it that will support additional forms of validation. The process metadata created by SciWalker is readily adapted for inclusion in Ecological Metadata Language (EML) files.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
John Burgess, Brian Gallagher, David Jensen, Brian Neil Levine
MaxProp: Routing for Vehicle-Based Disruption-Tolerant Networks Proceedings Article
In: INFOCOM 2006. 25th IEEE International Conference on Computer Communications, Joint Conference of the IEEE Computer and Communications Societies, 23-29 April 2006, Barcelona, Catalunya, Spain, IEEE, 2006.
@inproceedings{DBLP:conf/infocom/BurgessGJL06,
title = {MaxProp: Routing for Vehicle-Based Disruption-Tolerant Networks},
author = {John Burgess and Brian Gallagher and David Jensen and Brian Neil Levine},
url = {https://doi.org/10.1109/INFOCOM.2006.228},
doi = {10.1109/INFOCOM.2006.228},
year = {2006},
date = {2006-01-01},
booktitle = {INFOCOM 2006. 25th IEEE International Conference on Computer Communications,
Joint Conference of the IEEE Computer and Communications Societies,
23-29 April 2006, Barcelona, Catalunya, Spain},
publisher = {IEEE},
abstract = {Disruption-tolerant networks (DTNs) attempt to route network messages via intermittently connected nodes. Routing in such environments is difficult because peers have little information about the state of the partitioned network and transfer opportunities between peers are of limited duration. In this paper, we propose MaxProp, a protocol for effective routing of DTN messages. MaxProp is based on prioritizing both the schedule of packets transmitted to other peers and the schedule of packets to be dropped. These priorities are based on the path likelihoods to peers according to historical data and also on several complementary mechanisms, including acknowledgments, a head-start for new packets, and lists of previous intermediaries. Our evaluations show that MaxProp performs better than protocols that have access to an oracle that knows the schedule of meetings between peers. Our evaluations are based on 60 days of traces from a real DTN network we have deployed on 30 buses. Our network, called UMassDieselNet, serves a large geographic area between five colleges. We also evaluate MaxProp on simulated topologies and show it performs well in a wide variety of DTN environments.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matthew Rattigan, Marc Maier, David Jensen
Using structure indices for efficient approximation of network properties Proceedings Article
In: Proceedings of the Twelfth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Philadelphia, PA, USA, August 20-23, 2006, pp. 357–366, ACM, 2006.
@inproceedings{DBLP:conf/kdd/RattiganMJ06,
title = {Using structure indices for efficient approximation of network properties},
author = {Matthew Rattigan and Marc Maier and David Jensen},
url = {https://doi.org/10.1145/1150402.1150443},
doi = {10.1145/1150402.1150443},
year = {2006},
date = {2006-01-01},
booktitle = {Proceedings of the Twelfth ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining, Philadelphia, PA, USA, August
20-23, 2006},
pages = {357--366},
publisher = {ACM},
abstract = {Statistics on networks have become vital to the study of relational data drawn from areas such as bibliometrics, fraud detection, bioinformatics, and the Internet. Calculating many of the most important measures - such as betweenness centrality, closeness centrality, and graph diameter - requires identifying short paths in these networks. However, finding these short paths can be intractable for even moderate-size networks. We introduce the concept of a network structure index (NSI), a composition of (1) a set of annotations on every node in the network and (2) a function that uses the annotations to estimate graph distance between pairs of nodes. We present several varieties of NSIs, examine their time and space complexity, and analyze their performance on synthetic and real data sets. We show that creating an NSI for a given network enables extremely efficient and accurate estimation of a wide variety of network statistics on that network.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
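To make the NSI idea concrete, the sketch below shows one way to "annotate every node and estimate distance from the annotations alone": precompute shortest-path lengths to a few landmark nodes and use the best landmark detour as an upper-bound estimate. This landmark scheme is a stand-in chosen for brevity, not one of the specific NSI variants the paper evaluates, and it assumes a connected graph; the generator in the usage comment is hypothetical.

# Landmark-based distance estimation as a stand-in for an NSI (see note above).
import random
import networkx as nx

def build_annotations(g: nx.Graph, num_landmarks: int = 5, seed: int = 0):
    """Annotate the graph: shortest-path lengths from each sampled landmark."""
    rng = random.Random(seed)
    landmarks = rng.sample(list(g.nodes()), num_landmarks)
    return {lm: nx.single_source_shortest_path_length(g, lm) for lm in landmarks}

def estimate_distance(annotations, u, v) -> int:
    """Upper-bound d(u, v) by routing through the best landmark."""
    return min(dists[u] + dists[v] for dists in annotations.values())

# Usage (hypothetical graph): ann = build_annotations(nx.connected_watts_strogatz_graph(1000, 6, 0.1))
#                             estimate_distance(ann, 0, 500)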
Chirag Shah, W. Bruce Croft, David Jensen
Representing documents with named entities for story link detection (SLD) Proceedings Article
In: Proceedings of the 2006 ACM CIKM International Conference on Information and Knowledge Management, Arlington, Virginia, USA, November 6-11, 2006, pp. 868–869, ACM, 2006.
@inproceedings{DBLP:conf/cikm/ShahCJ06,
title = {Representing documents with named entities for story link detection
(SLD)},
author = {Chirag Shah and W. Bruce Croft and David Jensen},
url = {https://doi.org/10.1145/1183614.1183771},
doi = {10.1145/1183614.1183771},
year = {2006},
date = {2006-01-01},
booktitle = {Proceedings of the 2006 ACM CIKM International Conference on Information
and Knowledge Management, Arlington, Virginia, USA, November 6-11,
2006},
pages = {868--869},
publisher = {ACM},
abstract = {Several information organization, access, and filtering systems can benefit from different kinds of document representations than those used in traditional Information Retrieval (IR). Topic Detection and Tracking (TDT) is an example of such an application. In this paper we demonstrate that named entities serve as better choices of units for document representation over all words. In order to test this hypothesis we study the effect of words-based and entity-based representations on Story Link Detection (SLD) - a core task in TDT research. The experiments on TDT corpora show that entity-based representations give significant improvements for SLD. We also propose a mechanism to expand the set of named entities used for document representation, which enhances the performance in some cases. We then take a step further and analyze the limitations of using only named entities for the document representation. Our studies and experiments indicate that adding additional topical terms can help in addressing such limitations.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrew Fast, David Jensen
The NFL Coaching Network: Analysis of the Social Network among Professional Football Coaches Proceedings Article
In: Capturing and Using Patterns for Evidence Detection, Papers from the 2006 AAAI Fall Symposium, Washington, DC, USA, October 13-15, 2006, pp. 112–119, AAAI Press, 2006.
@inproceedings{DBLP:conf/aaaifs/FastJ06,
title = {The NFL Coaching Network: Analysis of the Social Network among Professional
Football Coaches},
author = {Andrew Fast and David Jensen},
url = {https://www.aaai.org/Library/Symposia/Fall/2006/fs06-02-017.php},
year = {2006},
date = {2006-01-01},
booktitle = {Capturing and Using Patterns for Evidence Detection, Papers from the
2006 AAAI Fall Symposium, Washington, DC, USA, October 13-15, 2006},
volume = {FS-06-02},
pages = {112--119},
publisher = {AAAI Press},
series = {AAAI Technical Report},
abstract = {The interactions of professional football coaches and teams in the National Football League (NFL) form a complex social network. This network provides a great opportunity to analyze the influence that coaching mentors have on their proteges. In this paper, we use this social network to identify notable coaches and characterize championship coaches. We also utilize the coaching network to learn a model of which teams will make the playoffs in a given year. Developing comprehensive models of complex adaptive networks, such as the network of NFL coaches, poses a difficult challenge for researchers. From our analysis of the NFL, we identify three types of dependencies that any model of complex network data must be able to represent.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, David Jensen
Bias/variance analysis for network data Proceedings Article
In: Proceedings of the Workshop on Statistical Relational Learning, 23rd International Conference on Machine Learning, 2006.
@inproceedings{neville2006bias,
title = {Bias/variance analysis for network data},
author = {Jennifer Neville and David Jensen},
url = {http://www.cs.umd.edu/projects/srl2006/papers/srl06-neville.pdf},
year = {2006},
date = {2006-01-01},
booktitle = {Proceedings of the Workshop on Statistical Relational Learning, 23rd International Conference on Machine Learning},
abstract = {Bias/variance analysis is a useful tool for investigating the performance of machine learning algorithms. Conventional analysis decomposes loss into errors due to aspects of the learning process, but in relational and network applications, the inference process introduces an additional source of error. Collective inference techniques introduce additional error both through the use of approximate inference algorithms and through variation in the availability of test set information. To date, the impact of inference error on model performance has not been investigated. In this paper, we propose a new bias/variance framework that decomposes loss into errors due to both the learning and inference process. We evaluate performance of three relational models on synthetic data and use the framework to understand the reasons for poor model performance. With this understanding, we propose a number of directions to explore to improve model performance.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2005
Matthew Rattigan, David Jensen
The case for anomalous link discovery Journal Article
In: SIGKDD Explorations, vol. 7, no. 2, pp. 41–47, 2005.
@article{DBLP:journals/sigkdd/RattiganJ05,
title = {The case for anomalous link discovery},
author = {Matthew Rattigan and David Jensen},
url = {https://doi.org/10.1145/1117454.1117460},
doi = {10.1145/1117454.1117460},
year = {2005},
date = {2005-01-01},
journal = {SIGKDD Explorations},
volume = {7},
number = {2},
pages = {41--47},
abstract = {In this paper, we describe the challenges inherent to the task of link prediction, and we analyze one reason why many link prediction models perform poorly. Specifically, we demonstrate the effects of the extremely large class skew associated with the link prediction task. We then present an alternate task --- anomalous link discovery (ALD) --- and qualitatively demonstrate the effectiveness of simple link prediction models for the ALD task. We show that even the simplistic structural models that perform poorly on link prediction can perform quite well at the ALD task.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Brian Gallagher, David Jensen, Brian Neil Levine, others
Explaining routing performance in disruption tolerant networks Journal Article
In: University of Massachusetts Amherst, Technical Report, 2005.
@article{gallagher2005explaining,
title = {Explaining routing performance in disruption tolerant networks},
author = {Brian Gallagher and David Jensen and Brian Neil Levine and others},
url = {https://kdl.cs.umass.edu/papers/gallagher-et-al-tr0557.pdf},
year = {2005},
date = {2005-01-01},
journal = {University of Massachusetts Amherst, Technical Report},
abstract = {Many routing algorithms for both traditional and ad hoc networks require a complete and contemporaneous path of peers from source to destination. Disruption Tolerant Networks (DTNs) attempt to deliver messages despite a frequently disconnected link layer (e.g., due to peer mobility, limited communication range, and power management limitations). While several algorithms have been proposed for routing in DTNs, this has not yet led to an understanding of the fundamental issues underlying routing performance in these networks. In this paper we explain the performance of routing algorithms for DTNs in terms of their ability to utilize a set of three no-cost drop criteria. The criteria are necessary and sufficient for identifying messages that may be dropped without degrading the overall delivery rate. The criteria identify whether a route exists with sufficient bandwidth, whether a message has been delivered already, and whether some other peer will deliver the message. We also use the criteria to design a new routing algorithm that we call NoCostDrop, which appears to be the first routing algorithm to take advantage of all three criteria. We show that NoCostDrop outperforms existing algorithms over a wide range of network conditions. Most novel in our approach is the use of a distributed list of delivered messages, which can easily be combined with existing routing algorithms to improve their performance.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
George Dean Bissias, Marc Liberatore, David Jensen, Brian Neil Levine
Privacy Vulnerabilities in Encrypted HTTP Streams Proceedings Article
In: Privacy Enhancing Technologies, 5th International Workshop, PET 2005, Cavtat, Croatia, May 30-June 1, 2005, Revised Selected Papers, pp. 1–11, Springer, 2005.
@inproceedings{DBLP:conf/pet/BissiasLJL05,
title = {Privacy Vulnerabilities in Encrypted HTTP Streams},
author = {George Dean Bissias and Marc Liberatore and David Jensen and Brian Neil Levine},
url = {https://doi.org/10.1007/11767831_1},
doi = {10.1007/11767831_1},
year = {2005},
date = {2005-01-01},
booktitle = {Privacy Enhancing Technologies, 5th International Workshop, PET
2005, Cavtat, Croatia, May 30-June 1, 2005, Revised Selected Papers},
volume = {3856},
pages = {1--11},
publisher = {Springer},
series = {Lecture Notes in Computer Science},
abstract = {Encrypting traffic does not prevent an attacker from performing some types of traffic analysis. We present a straightforward traffic analysis attack against encrypted HTTP streams that is surprisingly effective in identifying the source of the traffic. An attacker starts by creating a profile of the statistical characteristics of web requests from interesting sites, including distributions of packet sizes and inter-arrival times. Later, candidate encrypted streams are compared against these profiles. In our evaluations using real traffic, we find that many web sites are subject to this attack. With a training period of 24 hours and a 1 hour delay afterwards, the attack achieves only 23% accuracy. However, an attacker can easily pre-determine which of the trained sites are easily identifiable. Accordingly, against 25 such sites, the attack achieves 40% accuracy; with three guesses, the attack achieves 100% accuracy for our data. Longer delays after training decrease accuracy, but not substantially. We also propose some countermeasures and improvements to our current method. Previous work analyzed SSL traffic to a proxy, taking advantage of a known flaw in SSL that reveals the length of each web object. In contrast, we exploit the statistical characteristics of web streams that are encrypted as a single flow, which is the case with WEP/WPA, IPsec, and SSH tunnels.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, David Jensen
Leveraging Relational Autocorrelation with Latent Group Models Proceedings Article
In: Proceedings of the 5th IEEE International Conference on Data Mining (ICDM 2005), 27-30 November 2005, Houston, Texas, USA, pp. 322–329, IEEE Computer Society, 2005.
@inproceedings{DBLP:conf/icdm/NevilleJ05,
title = {Leveraging Relational Autocorrelation with Latent Group Models},
author = {Jennifer Neville and David Jensen},
url = {https://doi.org/10.1109/ICDM.2005.89},
doi = {10.1109/ICDM.2005.89},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings of the 5th IEEE International Conference on Data Mining
(ICDM 2005), 27-30 November 2005, Houston, Texas, USA},
pages = {322--329},
publisher = {IEEE Computer Society},
abstract = {The presence of autocorrelation provides a strong motivation for using relational learning and inference techniques. Autocorrelation is a statistical dependence between the values of the same variable on related entities and is a nearly ubiquitous characteristic of relational data sets. Recent research has explored the use of collective inference techniques to exploit this phenomenon. These techniques achieve significant performance gains by modeling observed correlations among class labels of related instances, but the models fail to capture a frequent cause of autocorrelation - the presence of underlying groups that influence the attributes on a set of entities. We propose a latent group model (LGM) for relational data, which discovers and exploits the hidden structures responsible for the observed autocorrelation among class labels. Modeling the latent group structure improves model performance, increases inference efficiency, and enhances our understanding of the datasets. We evaluate performance on three relational classification tasks and show that LGM outperforms models that ignore latent group structure, particularly when there is little information with which to seed inference.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrew Fast, David Jensen, Brian Neil Levine
Creating social networks to improve peer-to-peer networking Proceedings Article
In: Proceedings of the Eleventh ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Chicago, Illinois, USA, August 21-24, 2005, pp. 568–573, ACM, 2005.
@inproceedings{DBLP:conf/kdd/FastJL05,
title = {Creating social networks to improve peer-to-peer networking},
author = {Andrew Fast and David Jensen and Brian Neil Levine},
url = {https://doi.org/10.1145/1081870.1081938},
doi = {10.1145/1081870.1081938},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings of the Eleventh ACM SIGKDD International Conference
on Knowledge Discovery and Data Mining, Chicago, Illinois, USA, August
21-24, 2005},
pages = {568--573},
publisher = {ACM},
abstract = {We use knowledge discovery techniques to guide the creation of efficient overlay networks for peer-to-peer file sharing. An overlay network specifies the logical connections among peers in a network and is distinct from the physical connections of the network. It determines the order in which peers will be queried when a user is searching for a specific file. To better understand the role of the network overlay structure in the performance of peer-to-peer file sharing protocols, we compare several methods for creating overlay networks. We analyze the networks using data from a campus network for peer-to-peer file sharing that recorded anonymized data on 6,528 users sharing 291,925 music files over an 81-day period. We propose a novel protocol for overlay creation based on a model of user preference identified by latent-variable clustering with hierarchical Dirichlet processes (HDPs). Our simulations and empirical studies show that the clusters of songs created by HDPs effectively model user behavior and can be used to create desirable network overlays that outperform alternative approaches.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Stephen Hart, Roderic A. Grupen, David Jensen
A Relational Representation for Procedural Task Knowledge Proceedings Article
In: Proceedings, The Twentieth National Conference on Artificial Intelligence and the Seventeenth Innovative Applications of Artificial Intelligence Conference, July 9-13, 2005, Pittsburgh, Pennsylvania, USA, pp. 1280–1285, AAAI Press / The MIT Press, 2005.
@inproceedings{DBLP:conf/aaai/HartGJ05,
title = {A Relational Representation for Procedural Task Knowledge},
author = {Stephen Hart and Roderic A. Grupen and David Jensen},
url = {http://www.aaai.org/Library/AAAI/2005/aaai05-203.php},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings, The Twentieth National Conference on Artificial Intelligence
and the Seventeenth Innovative Applications of Artificial Intelligence
Conference, July 9-13, 2005, Pittsburgh, Pennsylvania, USA},
pages = {1280--1285},
publisher = {AAAI Press / The MIT Press},
abstract = {This paper proposes a methodology for learning joint probability estimates regarding the effect of sensorimotor features on the predicted quality of desired behavior. These relationships can then be used to choose actions that will most likely produce success. Relational dependency networks are used to learn statistical models of procedural task knowledge. An example task expert for picking up objects is learned through actual experience with a humanoid robot. We believe that this approach is widely applicable and has great potential to allow a robot to autonomously determine which features in the world are salient and should be used to recommend policy for action.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Matthew Rattigan, David Jensen
The case for anomalous link detection Proceedings Article
In: Proceedings of the 4th international workshop on multi-relational mining, pp. 69–74, 2005.
@inproceedings{rattigan2005case,
title = {The case for anomalous link detection},
author = {Matthew Rattigan and David Jensen},
url = {https://dl.acm.org/doi/pdf/10.1145/1090193.1090205},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings of the 4th international workshop on multi-relational mining},
pages = {69--74},
abstract = {In this paper, we describe the challenges inherent to the Link Prediction (LP) problem in multirelational data mining, and explore the reasons why many LP models have performed poorly. We present the alternate (and complementary) task of Anomalous Link Discovery (ALD) and qualitatively demonstrate the effectiveness of simple LP models for the ALD task.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Ozgur Simsek, David Jensen
A probabilistic framework for decentralized search in networks Proceedings Article
In: Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI), 2005.
@inproceedings{simsek2005probabilistic,
title = {A probabilistic framework for decentralized search in networks},
author = {Ozgur Simsek and David Jensen},
year = {2005},
date = {2005-01-01},
booktitle = {Proceedings of the International Joint Conference on Artificial Intelligence (IJCAI)},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Ozgur Simsek, David Jensen
Decentralized Search in Networks Using Homophily and Degree Disparity Proceedings Article
In: IJCAI-05, Proceedings of the Nineteenth International Joint Conference on Artificial Intelligence, Edinburgh, Scotland, UK, July 30 - August 5, 2005, pp. 304–310, Professional Book Center, 2005.
@inproceedings{SimsekJ05,
title = {Decentralized Search in Networks Using Homophily and Degree Disparity},
author = {Ozgur Simsek and David Jensen},
url = {http://ijcai.org/Proceedings/05/Papers/1509.pdf},
year = {2005},
date = {2005-01-01},
booktitle = {IJCAI-05, Proceedings of the Nineteenth International Joint Conference
on Artificial Intelligence, Edinburgh, Scotland, UK, July 30 - August
5, 2005},
pages = {304--310},
publisher = {Professional Book Center},
abstract = {We propose a new algorithm for finding a target node in a network whose topology is known only locally. We formulate this task as a problem of decision making under uncertainty and use the statistical properties of the graph to guide this decision. This formulation uses the homophily and degree structure of the network simultaneously, differentiating our algorithm from those previously proposed in the literature. Because homophily and degree disparity are characteristics frequently observed in real-world networks, the algorithm we propose is applicable to a wide variety of networks, including two families that have received much recent attention: small-world and scale-free networks.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2004
David Jensen, Jennifer Neville, Brian Gallagher
Why collective inference improves relational classification Proceedings Article
In: Proceedings of the Tenth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Seattle, Washington, USA, August 22-25, 2004, pp. 593–598, ACM, 2004.
@inproceedings{DBLP:conf/kdd/JensenNG04,
title = {Why collective inference improves relational classification},
author = {David Jensen and Jennifer Neville and Brian Gallagher},
url = {https://doi.org/10.1145/1014052.1014125},
doi = {10.1145/1014052.1014125},
year = {2004},
date = {2004-01-01},
booktitle = {Proceedings of the Tenth ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining, Seattle, Washington, USA, August
22-25, 2004},
pages = {593--598},
publisher = {ACM},
abstract = {Procedures for collective inference make simultaneous statistical judgments about the same variables for a set of related data instances. For example, collective inference could be used to simultaneously classify a set of hyperlinked documents or infer the legitimacy of a set of related financial transactions. Several recent studies indicate that collective inference can significantly reduce classification error when compared with traditional inference techniques. We investigate the underlying mechanisms for this error reduction by reviewing past work on collective inference and characterizing different types of statistical models used for making inference in relational data. We show important differences among these models, and we characterize the necessary and sufficient conditions for reduced classification error based on experiments with real and simulated data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, David Jensen
Dependency Networks for Relational Data Proceedings Article
In: Proceedings of the 4th IEEE International Conference on Data Mining (ICDM 2004), 1-4 November 2004, Brighton, UK, pp. 170–177, IEEE Computer Society, 2004.
@inproceedings{DBLP:conf/icdm/NevilleJ04,
title = {Dependency Networks for Relational Data},
author = {Jennifer Neville and David Jensen},
url = {https://doi.org/10.1109/ICDM.2004.10101},
doi = {10.1109/ICDM.2004.10101},
year = {2004},
date = {2004-01-01},
booktitle = {Proceedings of the 4th IEEE International Conference on Data Mining
(ICDM 2004), 1-4 November 2004, Brighton, UK},
pages = {170--177},
publisher = {IEEE Computer Society},
abstract = {Instance independence is a critical assumption of traditional machine learning methods contradicted by many relational datasets. For example, in scientific literature datasets, there are dependencies among the references of a paper. Recent work on graphical models for relational data has demonstrated significant performance gains for models that exploit the dependencies among instances. In this paper, we present relational dependency networks (RDNs), a new form of graphical model capable of reasoning with such dependencies in a relational setting. We describe the details of RDN models and outline their strengths, most notably the ability to learn and reason with cyclic relational dependencies. We present RDN models learned on a number of real-world datasets, and evaluate the models in a classification context, showing significant performance improvements. In addition, we use synthetic data to evaluate the quality of model learning and inference procedures.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, Micah Adler, David Jensen
Spectral clustering with links and attributes Technical Report
University of Massachusetts Amherst, Department of Computer Science, 2004.
@techreport{neville2004spectral,
title = {Spectral clustering with links and attributes},
author = {Jennifer Neville and Micah Adler and David Jensen},
url = {https://www.cs.purdue.edu/homes/neville/papers/neville-et-al-tr0442.pdf},
year = {2004},
date = {2004-01-01},
institution = {University of Massachusetts Amherst, Department of Computer Science},
abstract = {If relational data contain communities—groups of inter-related items with similar attribute values—a clustering technique that considers attribute information and the structure of relations simultaneously should produce more meaningful clusters than those produced by considering attributes alone. We investigate this hypothesis in the context of a spectral graph partitioning technique, considering a number of hybrid similarity metrics that combine both sources of information. Through simulation, we find that two of the hybrid metrics achieve superior performance over a wide range of data characteristics. We analyze the spectral decomposition algorithm from a statistical perspective and show that the successful hybrid metrics exaggerate the separation between cluster similarity values, at the expense of increased variance. We cluster several relational datasets using the best hybrid metric and show that the resulting clusters exhibit significant community structure, and that they significantly improve performance in a related classification task.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
Jennifer Neville, Ozgur Simsek, David Jensen
Autocorrelation and relational learning: Challenges and opportunities Technical Report
University of Massachusetts Amherst, Department of Computer Science, 2004.
@techreport{neville2004autocorrelation,
title = {Autocorrelation and relational learning: Challenges and opportunities},
author = {Jennifer Neville and Ozgur Simsek and David Jensen},
url = {http://www.cs.umd.edu/projects/srl2004/Papers/neville.pdf},
year = {2004},
date = {2004-01-01},
institution = {University of Massachusetts Amherst, Department of Computer Science},
abstract = {Autocorrelation, a common characteristic of many datasets, refers to correlation between values of the same variable on related objects. It violates the critical assumption of instance independence that underlies most conventional models. In this paper, we provide an overview of research on autocorrelation in a number of fields with an emphasis on implications for relational learning, and outline a number of challenges and opportunities for model learning and inference.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}
2003
Andrew McCallum, David Jensen
A note on the unification of information extraction and data mining using conditional-probability, relational models Journal Article
In: Computer Science Department Faculty Publication Series, pp. 42, 2003.
@article{mccallum2003note,
title = {A note on the unification of information extraction and data mining using conditional-probability, relational models},
author = {Andrew McCallum and David Jensen},
url = {http://ciir.cs.umass.edu/pubfiles/ir-306.pdf},
year = {2003},
date = {2003-01-01},
journal = {Computer Science Department Faculty Publication Series},
pages = {42},
abstract = {Although information extraction and data mining appear together in many applications, their interface in most current systems would better be described as serial juxtaposition than as tight integration. Information extraction populates slots in a database by identifying relevant subsequences of text, but is usually not aware of the emerging patterns and regularities in the database. Data mining methods begin from a populated database, and are often unaware of where the data came from, or its inherent uncertainties. The result is that the accuracy of both suffers, and significant mining of complex text sources is beyond reach. This position paper proposes the use of unified, relational, undirected graphical models for information extraction and data mining, in which extraction decisions and data-mining decisions are made in the same probabilistic “currency,” with a common inference procedure—each component thus being able to make up for the weaknesses of the other and therefore improving the performance of both. For example, data mining run on a partially-filled database can find patterns that provide “top-down” accuracy-improving constraints to information extraction. Information extraction can provide a much richer set of “bottom-up” hypotheses to data mining if the mining is set up to handle additional uncertainty information from extraction. We outline an approach and describe several models, but provide no experimental results.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Amy McGovern, Lisa Friedland, Michael Hay, Brian Gallagher, Andrew Fast, Jennifer Neville, David Jensen
Exploiting relational structure to understand publication patterns in high-energy physics Journal Article
In: SIGKDD Explor., vol. 5, no. 2, pp. 165–172, 2003.
@article{DBLP:journals/sigkdd/McGovernFHGFNJ03,
title = {Exploiting relational structure to understand publication patterns
in high-energy physics},
author = {Amy McGovern and Lisa Friedland and Michael Hay and Brian Gallagher and Andrew Fast and Jennifer Neville and David Jensen},
url = {https://doi.org/10.1145/980972.980999},
doi = {10.1145/980972.980999},
year = {2003},
date = {2003-01-01},
journal = {SIGKDD Explor.},
volume = {5},
number = {2},
pages = {165--172},
abstract = {We analyze publication patterns in theoretical high-energy physics using a relational learning approach. We focus on four related areas: understanding and identifying patterns of citations, examining publication patterns at the author level, predicting whether a paper will be accepted by specific journals, and identifying research communities from the citation patterns and paper text. Each of these analyses contributes to an overall understanding of theoretical high-energy physics.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
David Jensen, Jennifer Neville
Data mining in social networks Book
na, 2003.
@book{jensen2003data,
title = {Data mining in social networks},
author = {David Jensen and Jennifer Neville},
url = {https://scholarworks.umass.edu/cgi/viewcontent.cgi?article=1069&context=cs_faculty_pubs},
year = {2003},
date = {2003-01-01},
publisher = {na},
abstract = {Several techniques for learning statistical models have been developed recently by researchers in machine learning and data mining. All of these techniques must address a similar set of representational and algorithmic choices and must face a set of statistical challenges unique to learning from relational data.},
keywords = {},
pubstate = {published},
tppubtype = {book}
}
Jennifer Neville, David Jensen, Lisa Friedland, Michael Hay
Learning relational probability trees Proceedings Article
In: Proceedings of the Ninth ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Washington, DC, USA, August 24 - 27, 2003, pp. 625–630, ACM, 2003.
@inproceedings{DBLP:conf/kdd/NevilleJFH03,
title = {Learning relational probability trees},
author = {Jennifer Neville and David Jensen and Lisa Friedland and Michael Hay},
url = {https://doi.org/10.1145/956750.956830},
doi = {10.1145/956750.956830},
year = {2003},
date = {2003-01-01},
booktitle = {Proceedings of the Ninth ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining, Washington, DC, USA, August 24
- 27, 2003},
pages = {625--630},
publisher = {ACM},
abstract = {Classification trees are widely used in the machine learning and data mining communities for modeling propositional data. Recent work has extended this basic paradigm to probability estimation trees. Traditional tree learning algorithms assume that instances in the training data are homogenous and independently distributed. Relational probability trees (RPTs) extend standard probability estimation trees to a relational setting in which data instances are heterogeneous and interdependent. Our algorithm for learning the structure and parameters of an RPT searches over a space of relational features that use aggregation functions (e.g. AVERAGE, MODE, COUNT) to dynamically propositionalize relational data and create binary splits within the RPT. Previous work has identified a number of statistical biases due to characteristics of relational data such as autocorrelation and degree disparity. The RPT algorithm uses a novel form of randomization test to adjust for these biases. On a variety of relational learning tasks, RPTs built using randomization tests are significantly smaller than other models and achieve equivalent, or better, performance.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jennifer Neville, David Jensen
Collective classification with relational dependency networks Proceedings Article
In: Workshop on Multi-Relational Data Mining (MRDM-2003), pp. 77, 2003.
@inproceedings{neville2003collective,
title = {Collective classification with relational dependency networks},
author = {Jennifer Neville and David Jensen},
url = {https://www.cs.purdue.edu/homes/neville/papers/neville-jensen-mrdm2003.pdf},
year = {2003},
date = {2003-01-01},
booktitle = {Workshop on Multi-Relational Data Mining (MRDM-2003)},
pages = {77},
abstract = {Collective classification models exploit the dependencies in a network of objects to improve predictions. For example, in a network of web pages, the topic of a page may depend on the topics of hyperlinked pages. A relational model capable of expressing and reasoning with such dependencies should achieve superior performance to relational models that ignore such dependencies. In this paper, we present relational dependency networks (RDNs), extending recent work in dependency networks to a relational setting. RDNs are a collective classification model that offers simple parameter estimation and efficient structure learning. On two real-world data sets, we compare RDNs to ordinary classification with relational probability trees and show that collective classification improves performance.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}