Publications Search
Lisa Friedland, David Jensen, Michael Lavine
Copy or Coincidence? A Model for Detecting Social Influence and Duplication Events Proceedings Article
In: Proceedings of the 30th International Conference on Machine Learning, ICML 2013, Atlanta, GA, USA, 16-21 June 2013, pp. 1175–1183, JMLR.org, 2013.
Abstract | Links | BibTeX | Tags:
@inproceedings{DBLP:conf/icml/FriedlandJL13,
  author    = {Lisa Friedland and David Jensen and Michael Lavine},
  title     = {Copy or Coincidence? A Model for Detecting Social Influence and Duplication Events},
  booktitle = {Proceedings of the 30th International Conference on Machine Learning, ICML 2013, Atlanta, GA, USA, 16-21 June 2013},
  series    = {JMLR Workshop and Conference Proceedings},
  volume    = {28},
  pages     = {1175--1183},
  publisher = {JMLR.org},
  year      = {2013},
  date      = {2013-01-01},
  url       = {http://proceedings.mlr.press/v28/friedland13.html},
  abstract  = {In this paper, we analyze the task of inferring rare links between pairs of entities that seem too similar to have occurred by chance. Variations of this task appear in such diverse areas as social network analysis, security, fraud detection, and entity resolution. To address the task in a general form, we propose a simple, flexible mixture model in which most entities are generated independently from a distribution but a small number of pairs are constrained to be similar. We predict the true pairs using a likelihood ratio that trades off the entities’ similarity with their rarity. This method always outperforms using only similarity; however, with certain parameter settings, similarity turns out to be surprisingly competitive. Using real data, we apply the model to detect twins given their birth weights and to re-identify cell phone users based on distinctive usage patterns.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Marc Maier, Katerina Marazopoulou, David Arbour, David Jensen
Flattening network data for causal discovery: What could go wrong? Proceedings Article
In: Workshop on Information in Networks, 2013.
Abstract | Links | BibTeX | Tags:
@inproceedings{maier2013flattening,
  author    = {Marc Maier and Katerina Marazopoulou and David Arbour and David Jensen},
  title     = {Flattening network data for causal discovery: What could go wrong?},
  booktitle = {Workshop on Information in Networks},
  volume    = {64},
  year      = {2013},
  date      = {2013-01-01},
  url       = {https://www.semanticscholar.org/paper/Flattening-network-data-for-causal-discovery-%3A-What-Maier-Marazopoulou/c327100636c022c259f5e1bf2d7fcbbd0b048935},
  abstract  = {Methods for learning causal dependencies from observational data have been the focus of decades of work in social science, statistics, machine learning, and philosophy [9, 10, 11]. Much of the theoretical and practical work on causal discovery has focused on propositional representations. Propositional models effectively represent individual directed causal dependencies (e.g., path analysis, Bayesian networks) or conditional distributions of some outcome variable (e.g., linear regression, decision trees). However, propositional representations are limited to modeling independent and identically distributed (IID) data of a single entity type. Many real-world systems involve heterogeneous, interacting entities with probabilistic dependencies that cross the boundaries of those entities (i.e., non-IID data with multiple entity types and relationships). These systems produce network, or relational, data, and they are of paramount interest to researchers and practitioners across a wide range of disciplines. To model such data, researchers in statistics and computer science have devised more expressive classes of directed graphical models, such as probabilistic relational models (PRMs) [2] and directed acyclic probabilistic entity-relationship (DAPER) models [4]. Despite the assumptions embedded in propositional models, a common practice is to flatten, or propositionalize, relational data and use existing algorithms [5] (see Figure 1, focusing on algorithms that learn causal graphical models). While there are statistical concerns, this process is generally innocuous if the task is to model statistical associations for predictive inference. In contrast, to learn causal structure, estimate causal effects, or support inference over interventions, the effects of flattening inherently relational data can be particularly deleterious. In this paper, we identify four classes of potential issues that can occur with a propositionalization strategy as opposed to embracing a more expressive representation that would not succumb to these problems. We also present empirical results comparing the effectiveness of two theoretically sound and complete algorithms that learn causal structure: PC—a widely used constraint-based, propositional algorithm for causal discovery [11], and RCD—a recently developed constraint-based algorithm that reasons over a relational representation [6].},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Marc Maier, Katerina Marazopoulou, David Jensen
Reasoning about Independence in Probabilistic Models of Relational Data Miscellaneous
2013.
Abstract | Links | BibTeX | Tags: Causal Modeling
@misc{DBLP:journals/corr/abs-1302-4381,
  author        = {Marc Maier and Katerina Marazopoulou and David Jensen},
  title         = {Reasoning about Independence in Probabilistic Models of Relational Data},
  eprint        = {1302.4381},
  archiveprefix = {arXiv},
  journal       = {CoRR},
  volume        = {abs/1302.4381},
  year          = {2013},
  date          = {2013-01-01},
  url           = {http://arxiv.org/abs/1302.4381},
  abstract      = {We extend the theory of d-separation to cases in which data instances are not independent and identically distributed. We show that applying the rules of d-separation directly to the structure of probabilistic models of relational data inaccurately infers conditional independence. We introduce relational d-separation, a theory for deriving conditional independence facts from relational models. We provide a new representation, the abstract ground graph, that enables a sound, complete, and computationally efficient method for answering d-separation queries about relational models, and we present empirical results that demonstrate effectiveness.},
  keywords      = {Causal Modeling},
  pubstate      = {published},
  tppubtype     = {misc}
}
Matthew Rattigan
Leveraging Relational Representations for Causal Discovery PhD Thesis
2012, ISBN: 9781267786821, (AAI3545976).
@phdthesis{10.5555/2520420,
  author    = {Matthew Rattigan},
  title     = {Leveraging Relational Representations for Causal Discovery},
  school    = {University of Massachusetts Amherst},
  year      = {2012},
  date      = {2012-01-01},
  isbn      = {9781267786821},
  note      = {AAI3545976},
  abstract  = {This thesis represents a synthesis of relational learning and causal discovery, two subjects at the frontier of machine learning research. Relational learning investigates algorithms for constructing statistical models of data drawn from of multiple types of interrelated entities, and causal discovery investigates algorithms for constructing causal models from observational data. My work demonstrates that there exists a natural, methodological synergy between these two areas of study, and that despite the sometimes onerous nature of each, their combination (perhaps counterintuitively) can provide advances in the state of the art for both. Traditionally, propositional (or "flat") data representations have dominated the statistical sciences. These representations assume that data consist of independent and identically distributed (iid) entities which can be represented by a single data table. More recently, data scientists have increasingly focused on "relational" data sets that consist of interrelated, heterogeneous entities. However, relational learning and causal discovery are rarely combined. Relational representations are wholly absent from the literature where causality is discussed explicitly. Instead, the literature on causality that uses the framework of graphical models assumes that data are independent and identically distributed. This unexplored topical intersection represents an opportunity for advancement — by combining relational learning with causal reasoning, we can provide insight into the challenges found in each subject area. By adopting a causal viewpoint, we can clarify the mechanisms that produce previously identified pathologies in relational learning. Analogously, we can utilize relational data to establish and strengthen causal claims in ways that are impossible using only propositional representations.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {phdthesis}
}
Huseyin Oktay, A Soner Balkir, Ian Foster, David Jensen
Distance estimation for very large networks using mapreduce and network structure indices Proceedings Article
In: Workshop on Information in Networks, 2011.
@inproceedings{oktay2011distance,
  author    = {Huseyin Oktay and A Soner Balkir and Ian Foster and David Jensen},
  title     = {Distance estimation for very large networks using {MapReduce} and network structure indices},
  booktitle = {Workshop on Information in Networks},
  year      = {2011},
  date      = {2011-01-01},
  abstract  = {The ability to discover low-cost paths in networks has practical consequences for knowledge discovery and social network analysis tasks. Many analytic techniques for networks require finding low-cost paths, but exact methods for search become prohibitive for large networks, and data sets are steadily increasing in size. Short paths can be found efficiently by utilizing an index of network structure, which estimates network distances and enables rapid discovery of short paths. Through experiments on synthetic networks, we demonstrate that one such novel network structure index based on the shortest-path tree outperforms other previously proposed indices. We also show that it generalizes across arbitrarily weighted networks of various structures and densities, provides accurate estimates of distance, and has efficient time and space complexity. We present results on real data sets for several applications, including navigation, diameter estimation, centrality computation, and clustering---all made efficient by virtue of the network structure index.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Marc Maier, Matthew Rattigan, David Jensen
Indexing Network Structure with Shortest-Path Trees Journal Article
In: ACM Trans. Knowl. Discov. Data, vol. 5, no. 3, 2011, ISSN: 1556-4681.
Abstract | Links | BibTeX | Tags: Navigation and Routing in Networks
@article{10.1145/1993077.1993079,
  author    = {Marc Maier and Matthew Rattigan and David Jensen},
  title     = {Indexing Network Structure with Shortest-Path Trees},
  journal   = {ACM Trans. Knowl. Discov. Data},
  volume    = {5},
  number    = {3},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  year      = {2011},
  date      = {2011-08-01},
  issn      = {1556-4681},
  doi       = {10.1145/1993077.1993079},
  url       = {https://doi.org/10.1145/1993077.1993079},
  abstract  = {The ability to discover low-cost paths in networks has practical consequences for knowledge discovery and social network analysis tasks. Many analytic techniques for networks require finding low-cost paths, but exact methods for search become prohibitive for large networks, and data sets are steadily increasing in size. Short paths can be found efficiently by utilizing an index of network structure, which estimates network distances and enables rapid discovery of short paths. Through experiments on synthetic networks, we demonstrate that one such novel network structure index based on the shortest-path tree outperforms other previously proposed indices. We also show that it generalizes across arbitrarily weighted networks of various structures and densities, provides accurate estimates of distance, and has efficient time and space complexity. We present results on real data sets for several applications, including navigation, diameter estimation, centrality computation, and clustering---all made efficient by virtue of the network structure index.},
  keywords  = {Navigation and Routing in Networks},
  pubstate  = {published},
  tppubtype = {article}
}
Phillip B Kirlin, David Jensen
Probabilistic Modeling of Hierarchical Music Analysis Proceedings Article
In: Proceedings of the 12th International Society for Music Information Retrieval Conference, ISMIR, pp. 393–398, 2011.
Abstract | Links | BibTeX | Tags:
@inproceedings{kirlin2011probabilistic,
  author    = {Phillip B Kirlin and David Jensen},
  title     = {Probabilistic Modeling of Hierarchical Music Analysis},
  booktitle = {Proceedings of the 12th International Society for Music Information Retrieval Conference, ISMIR},
  pages     = {393--398},
  year      = {2011},
  date      = {2011-01-01},
  url       = {http://ismir2011.ismir.net/papers/PS3-7.pdf},
  abstract  = {Hierarchical music analysis, as exemplified by Schenkerian analysis, describes the structure of a musical composition by a hierarchy among its notes. Each analysis defines a set of prolongations, where musical objects persist in time even though others are present. We present a formal model for representing hierarchical music analysis, probabilistic interpretations of that model, and an efficient algorithm for computing the most probable analysis under these interpretations. We represent Schenkerian analyses as maximal outerplanar graphs (MOPs). We use this representation to encode the largest known data set of computer-processable Schenkerian analyses, and we use these data to identify statistical regularities in the human-generated analyses. We show that a dynamic programming algorithm can be applied to these regularities to identify the maximum likelihood analysis for a given piece of music.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Matthew Rattigan, Marc Maier, David Jensen
Relational blocking for causal discovery Proceedings Article
In: Proceedings of the Twenty-Fifth AAAI Conference on Artificial Intelligence, 2011.
Abstract | Links | BibTeX | Tags: Causal Modeling
@inproceedings{rattigan2011relational,
  author    = {Matthew Rattigan and Marc Maier and David Jensen},
  title     = {Relational blocking for causal discovery},
  booktitle = {Proceedings of the Twenty-Fifth AAAI Conference on Artificial Intelligence},
  volume    = {25},
  number    = {1},
  year      = {2011},
  date      = {2011-01-01},
  url       = {http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/view/3760},
  abstract  = {Blocking is a technique commonly used in manual statistical analysis to account for confounding variables. However, blocking is not currently used in automated learning algorithms. These algorithms rely solely on statistical conditioning as an operator to identify conditional independence. In this work, we present relational blocking as a new operator that can be used for learning the structure of causal models. We describe how blocking is enabled by relational data sets, where blocks are determined by the links in the network. By blocking on entities rather than conditioning on variables, relational blocking can account for both measured and unobserved variables. We explain the mechanism of these methods using graphical models and the semantics of d-separation. Finally, we demonstrate the effectiveness of relational blocking for use in causal discovery by showing how blocking can be used in the causal analysis of two real-world social media systems.},
  keywords  = {Causal Modeling},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Michael Hay, Gerome Miklau, David Jensen
Analyzing private network data Journal Article
In: Privacy-aware knowledge discovery: Novel applications and new techniques, pp. 459–498, 2010.
BibTeX | Tags:
@incollection{hay2010analyzing,
  author    = {Michael Hay and Gerome Miklau and David Jensen},
  title     = {Analyzing private network data},
  booktitle = {Privacy-aware knowledge discovery: Novel applications and new techniques},
  pages     = {459--498},
  publisher = {Chapman \& Hall/CRC},
  year      = {2010},
  date      = {2010-01-01},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {incollection}
}
Huseyin Oktay, Brian Taylor, David Jensen
Causal discovery in social media using quasi-experimental designs Proceedings Article
In: Proceedings of the 3rd Workshop on Social Network Mining and Analysis, SNAKDD, pp. 1–9, 2010.
Abstract | Links | BibTeX | Tags: Causal Modeling
@inproceedings{oktay2010causal,
  author    = {Huseyin Oktay and Brian Taylor and David Jensen},
  title     = {Causal discovery in social media using quasi-experimental designs},
  booktitle = {Proceedings of the 3rd Workshop on Social Network Mining and Analysis, SNAKDD},
  pages     = {1--9},
  year      = {2010},
  date      = {2010-01-01},
  doi       = {10.1145/1964858.1964859},
  url       = {https://doi.org/10.1145/1964858.1964859},
  abstract  = {Social media systems have become increasingly attractive to both users and companies providing those systems. Efficient management of these systems is essential and requires knowledge of cause-and-effect relationships within the system. Online experimentation can be used to discover causal knowledge; however, this ignores the observational data that is already being collected for operational purposes. Quasi-experimental designs (QEDs) are commonly used in social sciences to discover causal knowledge from observational data, and QEDs can be exploited to discover causal knowledge about social media systems. In this paper, we apply three different QEDs to demonstrate how one can gain a causal understanding of a social media system. The conclusions drawn from using a QED can have threats to their validity, but we show how one can carefully construct sophisticated designs to overcome some of those threats.},
  keywords  = {Causal Modeling},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Marc Maier, Brian Taylor, Huseyin Oktay, David Jensen
Learning causal models of relational domains Proceedings Article
In: Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2010, Atlanta, Georgia, USA, July 11-15, 2010, 2010.
Abstract | Links | BibTeX | Tags: Causal Modeling
@inproceedings{maier2010learning,
  author    = {Marc Maier and Brian Taylor and Huseyin Oktay and David Jensen},
  title     = {Learning causal models of relational domains},
  booktitle = {Proceedings of the Twenty-Fourth AAAI Conference on Artificial Intelligence, AAAI 2010, Atlanta, Georgia, USA, July 11-15, 2010},
  volume    = {24},
  number    = {1},
  year      = {2010},
  date      = {2010-01-01},
  url       = {http://www.aaai.org/ocs/index.php/AAAI/AAAI10/paper/view/1919},
  abstract  = {Methods for discovering causal knowledge from observational data have been a persistent topic of AI research for several decades. Essentially all of this work focuses on knowledge representations for propositional domains. In this paper, we present several key algorithmic and theoretical innovations that extend causal discovery to relational domains. We provide strong evidence that effective learning of causal models is enhanced by relational representations. We present an algorithm, relational PC, that learns causal dependencies in a state-of-the-art relational representation, and we identify the key representational and algorithmic innovations that make the algorithm possible. Finally, we prove the algorithm's theoretical correctness and demonstrate its effectiveness on synthetic and real data sets.},
  keywords  = {Causal Modeling},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Matthew Rattigan, David Jensen
Leveraging d-separation for relational data sets Proceedings Article
In: ICDM 2010, The 10th IEEE International Conference on Data Mining, pp. 989–994, IEEE 2010.
Abstract | Links | BibTeX | Tags:
@inproceedings{rattigan2010leveraging,
  author       = {Matthew Rattigan and David Jensen},
  title        = {Leveraging d-separation for relational data sets},
  booktitle    = {ICDM 2010, The 10th IEEE International Conference on Data Mining},
  pages        = {989--994},
  organization = {IEEE},
  year         = {2010},
  date         = {2010-01-01},
  doi          = {10.1109/ICDM.2010.142},
  url          = {https://doi.org/10.1109/ICDM.2010.142},
  abstract     = {Testing for marginal and conditional independence is a common task in machine learning and knowledge discovery applications. Prior work has demonstrated that conventional independence tests suffer from dramatically increased rates of Type I errors when naively applied to relational data. We use graphical models to specify the conditions under which these errors occur, and use those models to devise novel and accurate conditional independence tests.},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
Michael Hay, Gerome Miklau, David Jensen, Don Towsley, Chao Li
Resisting structural re-identification in anonymized social networks Journal Article
In: The VLDB Journal, vol. 19, no. 6, pp. 797–823, 2010.
Abstract | Links | BibTeX | Tags:
@article{hay2010resisting,
  author    = {Michael Hay and Gerome Miklau and David Jensen and Don Towsley and Chao Li},
  title     = {Resisting structural re-identification in anonymized social networks},
  journal   = {The VLDB Journal},
  volume    = {19},
  number    = {6},
  pages     = {797--823},
  publisher = {Springer-Verlag},
  year      = {2010},
  date      = {2010-01-01},
  doi       = {10.1007/s00778-010-0210-x},
  url       = {https://doi.org/10.1007/s00778-010-0210-x},
  abstract  = {We identify privacy risks associated with releasing network data sets and provide an algorithm that mitigates those risks. A network consists of entities connected by links representing relations such as friendship, communication, or shared activity. Maintaining privacy when publishing networked data is uniquely challenging because an individual's network context can be used to identify them even if other identifying information is removed. In this paper, we quantify the privacy risks associated with three classes of attacks on the privacy of individuals in networks, based on the knowledge used by the adversary. We show that the risks of these attacks vary greatly based on network structure and size. We propose a novel approach to anonymizing network data that models aggregate network structure and then allows samples to be drawn from that model. The approach guarantees anonymity for network entities while preserving the ability to estimate a wide variety of network measures with relatively little bias.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
Brian Delaney, Andrew Fast, W Campbell, C Weinstein, David Jensen
The application of statistical relational learning to a database of criminal and terrorist activity Proceedings Article
In: Proceedings of the 2010 SIAM International Conference on Data Mining, pp. 409–417, Society for Industrial and Applied Mathematics 2010.
Abstract | Links | BibTeX | Tags:
@inproceedings{delaney2010application,
  author       = {Brian Delaney and Andrew Fast and W Campbell and C Weinstein and David Jensen},
  title        = {The application of statistical relational learning to a database of criminal and terrorist activity},
  booktitle    = {Proceedings of the 2010 SIAM International Conference on Data Mining},
  pages        = {409--417},
  organization = {Society for Industrial and Applied Mathematics},
  year         = {2010},
  date         = {2010-01-01},
  doi          = {10.1137/1.9781611972801.36},
  url          = {https://doi.org/10.1137/1.9781611972801.36},
  abstract     = {We apply statistical relational learning to a database of criminal and terrorist activity to predict attributes and event outcomes. The database stems from a collection of news articles and court records which are carefully annotated with a variety of variables, including categorical and continuous fields. Manual analysis of this data can help inform decision makers seeking to curb violent activity within a region. We use this data to build relational models from historical data to predict attributes of groups, individuals, or events. Our first example involves predicting social network roles within a group under a variety of different data conditions. Collective classification can be used to boost the accuracy under data poor conditions. Additionally, we were able to predict the outcome of hostage negotiations using models trained on previous kidnapping events. The overall framework and techniques described here are flexible enough to be used to predict a variety of variables. Such predictions could be used as input to a more complex system to recognize intent of terrorist groups or as input to inform human decision makers.},
  keywords     = {},
  pubstate     = {published},
  tppubtype    = {inproceedings}
}
Michael Hay, Chao Li, Gerome Miklau, David Jensen
Accurate Estimation of the Degree Distribution of Private Networks Proceedings Article
In: ICDM 2009, The Ninth IEEE International Conference on Data Mining, Miami, Florida, USA, 6-9 December 2009, pp. 169–178, IEEE Computer Society, 2009.
Abstract | Links | BibTeX | Tags: Privacy and Networks
@inproceedings{hay2009accurate,
  author    = {Michael Hay and Chao Li and Gerome Miklau and David Jensen},
  title     = {Accurate Estimation of the Degree Distribution of Private Networks},
  booktitle = {ICDM 2009, The Ninth IEEE International Conference on Data Mining, Miami, Florida, USA, 6-9 December 2009},
  pages     = {169--178},
  publisher = {IEEE Computer Society},
  year      = {2009},
  date      = {2009-01-01},
  doi       = {10.1109/ICDM.2009.11},
  url       = {https://doi.org/10.1109/ICDM.2009.11},
  abstract  = {We describe an efficient algorithm for releasing a provably private estimate of the degree distribution of a network. The algorithm satisfies a rigorous property of differential privacy, and is also extremely efficient, running on networks of 100 million nodes in a few seconds. Theoretical analysis shows that the error scales linearly with the number of unique degrees, whereas the error of conventional techniques scales linearly with the number of nodes. We complement the theoretical analysis with a thorough empirical analysis on real and synthetic graphs, showing that the algorithm’s variance and bias is low, that the error diminishes as the size of the input graph increases, and that common analyses like fitting a power-law can be carried out very accurately.},
  keywords  = {Privacy and Networks},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
Andrew Fast, David Jensen
Constraint relaxation for learning the structure of Bayesian networks Technical Report
Tech Report 09-18, University of Massachusetts Amherst, Department of Computer Science, 2009.
Abstract | Links | BibTeX | Tags:
@techreport{fast2009constraint,
  author      = {Andrew Fast and David Jensen},
  title       = {Constraint relaxation for learning the structure of {Bayesian} networks},
  institution = {University of Massachusetts Amherst, Department of Computer Science},
  number      = {09-18},
  year        = {2009},
  date        = {2009-01-01},
  url         = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.217.7971&rep=rep1&type=pdf},
  abstract    = {This paper introduces constraint relaxation, a new strategy for learning the structure of Bayesian networks. Constraint relaxation identifies and “relaxes” possibly inaccurate independence constraints on the structure of the model. We describe a heuristic algorithm for constraint relaxation that combines greedy search in the space of undirected skeletons with edge orientation based on the constraints. This approach produces significant improvements in the structural accuracy of the learned models compared to four well-known structure learning algorithms in an empirical evaluation using data sampled from both real-world and randomly generated networks.},
  keywords    = {},
  pubstate    = {published},
  tppubtype   = {techreport}
}
Jennifer Neville, David Jensen
A bias/variance decomposition for models using collective inference Journal Article
In: Machine Learning, vol. 73, no. 1, pp. 87–106, 2008.
Abstract | Links | BibTeX | Tags:
@article{neville2008bias,
  author    = {Jennifer Neville and David Jensen},
  title     = {A bias/variance decomposition for models using collective inference},
  journal   = {Machine Learning},
  volume    = {73},
  number    = {1},
  pages     = {87--106},
  publisher = {Springer US},
  year      = {2008},
  date      = {2008-01-01},
  doi       = {10.1007/s10994-008-5066-6},
  url       = {https://doi.org/10.1007/s10994-008-5066-6},
  abstract  = {Bias/variance analysis is a useful tool for investigating the performance of machine learning algorithms. Conventional analysis decomposes loss into errors due to aspects of the learning process, but in relational domains, the inference process used for prediction introduces an additional source of error. Collective inference techniques introduce additional error, both through the use of approximate inference algorithms and through variation in the availability of test-set information. To date, the impact of inference error on model performance has not been investigated. We propose a new bias/variance framework that decomposes loss into errors due to both the learning and inference processes. We evaluate the performance of three relational models on both synthetic and real-world datasets and show that (1) inference can be a significant source of error, and (2) the models exhibit different types of errors as data characteristics are varied.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {article}
}
David Jensen, Andrew Fast, Brian Taylor, Marc Maier
Automatic identification of quasi-experimental designs for discovering causal knowledge Proceedings Article
In: Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Las Vegas, Nevada, USA, August 24-27, 2008, pp. 372–380, 2008.
Abstract | Links | BibTeX | Tags:
@inproceedings{jensen2008automatic,
  author    = {David Jensen and Andrew Fast and Brian Taylor and Marc Maier},
  title     = {Automatic identification of quasi-experimental designs for discovering causal knowledge},
  booktitle = {Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, Las Vegas, Nevada, USA, August 24-27, 2008},
  pages     = {372--380},
  year      = {2008},
  date      = {2008-01-01},
  doi       = {10.1145/1401890.1401938},
  url       = {https://doi.org/10.1145/1401890.1401938},
  abstract  = {Researchers in the social and behavioral sciences routinely rely on quasi-experimental designs to discover knowledge from large data-bases. Quasi-experimental designs (QEDs) exploit fortuitous circumstances in non-experimental data to identify situations (sometimes called "natural experiments") that provide the equivalent of experimental control and randomization. QEDs allow researchers in domains as diverse as sociology, medicine, and marketing to draw reliable inferences about causal dependencies from non-experimental data. Unfortunately, identifying and exploiting QEDs has remained a painstaking manual activity, requiring researchers to scour available databases and apply substantial knowledge of statistics. However, recent advances in the expressiveness of databases, and increases in their size and complexity, provide the necessary conditions to automatically identify QEDs. In this paper, we describe the first system to discover knowledge by applying quasi-experimental designs that were identified automatically. We demonstrate that QEDs can be identified in a traditional database schema and that such identification requires only a small number of extensions to that schema, knowledge about quasi-experimental design encoded in first-order logic, and a theorem-proving engine. We describe several key innovations necessary to enable this system, including methods for automatically constructing appropriate experimental units and for creating aggregate variables on those units. We show that applying the resulting designs can identify important causal dependencies in real domains, and we provide examples from academic publishing, movie making and marketing, and peer-production systems. Finally, we discuss the integration of QEDs with other approaches to causal discovery, including joint modeling and directed experimentation.},
  keywords  = {},
  pubstate  = {published},
  tppubtype = {inproceedings}
}
David Jensen, Andrew Fast, Brian Taylor, Marc Maier
Automatic Identification of Quasi-Experimental Designs for Discovering Causal Knowledge Proceedings Article
In: Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, pp. 372–380, Association for Computing Machinery, Las Vegas, Nevada, USA, 2008, ISBN: 9781605581934.
Abstract | Links | BibTeX | Tags: causal discovery, Causal Modeling, quasi-experimental design
@inproceedings{10.1145/1401890.1401938,
title = {Automatic Identification of Quasi-Experimental Designs for Discovering Causal Knowledge},
author = {David Jensen and Andrew Fast and Brian Taylor and Marc Maier},
url = {https://doi.org/10.1145/1401890.1401938},
doi = {10.1145/1401890.1401938},
isbn = {9781605581934},
year = {2008},
date = {2008-01-01},
booktitle = {Proceedings of the 14th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
pages = {372--380},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
venue = {Las Vegas, Nevada, USA},
series = {KDD '08},
abstract = {Researchers in the social and behavioral sciences routinely rely on quasi-experimental designs to discover knowledge from large data-bases. Quasi-experimental designs (QEDs) exploit fortuitous circumstances in non-experimental data to identify situations (sometimes called "natural experiments") that provide the equivalent of experimental control and randomization. QEDs allow researchers in domains as diverse as sociology, medicine, and marketing to draw reliable inferences about causal dependencies from non-experimental data. Unfortunately, identifying and exploiting QEDs has remained a painstaking manual activity, requiring researchers to scour available databases and apply substantial knowledge of statistics. However, recent advances in the expressiveness of databases, and increases in their size and complexity, provide the necessary conditions to automatically identify QEDs. In this paper, we describe the first system to discover knowledge by applying quasi-experimental designs that were identified automatically. We demonstrate that QEDs can be identified in a traditional database schema and that such identification requires only a small number of extensions to that schema, knowledge about quasi-experimental design encoded in first-order logic, and a theorem-proving engine. We describe several key innovations necessary to enable this system, including methods for automatically constructing appropriate experimental units and for creating aggregate variables on those units. We show that applying the resulting designs can identify important causal dependencies in real domains, and we provide examples from academic publishing, movie making and marketing, and peer-production systems. Finally, we discuss the integration of QEDs with other approaches to causal discovery, including joint modeling and directed experimentation.},
keywords = {causal discovery, Causal Modeling, quasi-experimental design},
pubstate = {published},
tppubtype = {inproceedings}
}
Andrew Fast, Michael Hay, David Jensen
Improving accuracy of constraint-based structure learning Technical Report
Technical report 08-48, University of Massachusetts Amherst, Computer Science Department, 2008.
Abstract | Links | BibTeX | Tags:
@techreport{fast2008improving,
title = {Improving accuracy of constraint-based structure learning},
author = {Andrew Fast and Michael Hay and David Jensen},
url = {https://www.researchgate.net/profile/David-Jensen-10/publication/228854891_Improving_Accuracy_of_Constraint-Based_Structure_Learning/links/09e41510892d741c18000000/Improving-Accuracy-of-Constraint-Based-Structure-Learning.pdf},
year = {2008},
date = {2008-01-01},
number = {08-48},
institution = {University of Massachusetts Amherst, Computer Science Department},
abstract = {Hybrid algorithms for learning the structure of Bayesian networks combine techniques from both the constraint-based and search-and-score paradigms of structure learning. One class of hybrid approaches uses a constraint-based algorithm to learn an undirected skeleton identifying edges that should appear in the final network. This skeleton is used to constrain the model space considered by a search-and-score algorithm to orient the edges and produce a final model structure. At small sample sizes, the performance of models learned using this hybrid approach do not achieve likelihood as high as models learned by unconstrained search. Low performance is a result of errors made by the skeleton identification algorithm, particularly false negative errors, which lead to an over-constrained search space. These errors are often attributed to “noisy” hypothesis tests that are run during skeleton identification. However, at least three specific sources of error have been identified in the literature: unsuitable hypothesis tests, low-power hypothesis tests, and unexplained d-separation. No previous work has considered these sources of error in combination. We determine the relative importance of each source individually and in combination. We identify that low-power tests are the primary source of false negative errors, and show that these errors can be corrected by a novel application of statistical power analysis. The result is a new hybrid algorithm for learning the structure of Bayesian networks which produces models with equivalent likelihood to models produced by unconstrained greedy search, using only a fraction of the time.},
keywords = {},
pubstate = {published},
tppubtype = {techreport}
}