Publications
2021
Terrance E. Boult, Przemyslaw A. Grabowicz, D. S. Prijatelj, R. Stern, L. Holder, J. Alspector, M. Jafarzadeh, T. Ahmad, A. R. Dhamija, C. Li, S. Cruz, A. Shrivastava, C. Vondrick, W. J. Scheirer
A Unifying Framework for Formal Theories of Novelty: Framework, Examples and Discussion Proceedings Article
In: AAAI'21 SMPT, 2021, ISSN: 2331-8422.
@inproceedings{Boult2020a,
title = {A Unifying Framework for Formal Theories of Novelty: Framework, Examples and Discussion},
author = {Terrance E. Boult and Przemyslaw A. Grabowicz and D. S. Prijatelj and R. Stern and L. Holder and J. Alspector and M. Jafarzadeh and T. Ahmad and A. R. Dhamija and C. Li and S. Cruz and A. Shrivastava and C. Vondrick and W. J. Scheirer},
url = {http://arxiv.org/abs/2012.04226},
issn = {2331-8422},
year = {2021},
date = {2021-12-01},
booktitle = {AAAI'21 SMPT},
abstract = {Managing inputs that are novel, unknown, or out-of-distribution is critical as an agent moves from the lab to the open world. Novelty-related problems include being tolerant to novel perturbations of the normal input, detecting when the input includes novel items, and adapting to novel inputs. While significant research has been undertaken in these areas, a noticeable gap exists in the lack of a formalized definition of novelty that transcends problem domains. As a team of researchers spanning multiple research groups and different domains, we have seen, first hand, the difficulties that arise from ill-specified novelty problems, as well as inconsistent definitions and terminology. Therefore, we present the first unified framework for formal theories of novelty and use the framework to formally define a family of novelty types. Our framework can be applied across a wide range of domains, from symbolic AI to reinforcement learning, and beyond to open world image recognition. Thus, it can be used to help kick-start new research efforts and accelerate ongoing work on these important novelty-related problems. This extended version of our AAAI 2021 paper included more details and examples in multiple domains.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Aarshee Mishra, Przemyslaw A. Grabowicz, Nicholas Perello
Towards Fair and Explainable Supervised Learning Proceedings Article
In: ICML Workshop on Socially Responsible Machine Learning, 2021.
@inproceedings{Mishra2021,
title = {Towards Fair and Explainable Supervised Learning},
author = {Aarshee Mishra and Przemyslaw A. Grabowicz and Nicholas Perello},
url = {https://drive.google.com/file/d/1z24hITF0Xrlc_IX_rOZVZ2aigOj1hxhD/view?usp=sharing},
year = {2021},
date = {2021-01-01},
booktitle = {ICML Workshop on Socially Responsible Machine Learning},
abstract = {Algorithms that aid human decision-making may inadvertently discriminate against certain protected groups. We formalize direct discrimination as a direct causal effect of the protected attributes on the decisions, while induced indirect discrimination as a change in the influence of non-protected features associated with the protected attributes. The measurements of average treatment effect (ATE) and SHapley Additive exPlanations (SHAP) reveal that state-of-the-art fair learning methods can inadvertently induce indirect discrimination in synthetic and real-world datasets. To inhibit discrimination in algorithmic systems, we propose to nullify the influence of the protected attribute on the output of the system, while preserving the influence of remaining features. To achieve this objective, we introduce a risk minimization method which optimizes for the proposed fairness objective. We show that the method leverages model accuracy and disparity measures.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Amanda M Gentzel, Purva Pruthi, David Jensen
How and Why to Use Experimental Data to Evaluate Methods for Observational Causal Inference Proceedings Article
In: International Conference on Machine Learning, pp. 3660–3671, PMLR 2021.
@inproceedings{gentzel2021and,
title = {How and Why to Use Experimental Data to Evaluate Methods for Observational Causal Inference},
author = {Amanda M Gentzel and Purva Pruthi and David Jensen},
url = {http://proceedings.mlr.press/v139/gentzel21a/gentzel21a.pdf},
year = {2021},
date = {2021-01-01},
booktitle = {International Conference on Machine Learning},
pages = {3660--3671},
organization = {PMLR},
abstract = {Methods that infer causal dependence from observational data are central to many areas of science, including medicine, economics, and the social sciences. A variety of theoretical properties of these methods have been proven, but empirical evaluation remains a challenge, largely due to the lack of observational data sets for which treatment effect is known. We describe and analyze observational sampling from randomized controlled trials (OSRCT), a method for evaluating causal inference methods using data from randomized controlled trials (RCTs). This method can be used to create constructed observational data sets with corresponding unbiased estimates of treatment effect, substantially increasing the number of data sets available for evaluating causal inference methods. We show that, in expectation, OSRCT creates data sets that are equivalent to those produced by randomly sampling from empirical data sets in which all potential outcomes are available. We then perform a large-scale evaluation of seven causal inference methods over 37 data sets, drawn from RCTs, as well as simulators, real-world computational systems, and observational data sets augmented with a synthetic response variable. We find notable performance differences when comparing across data from different sources, demonstrating the importance of using data from a variety of sources when evaluating any causal inference method.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
David Jensen
Improving Causal Inference by Increasing Model Expressiveness Proceedings Article
In: Proceedings of the AAAI Conference on Artificial Intelligence, pp. 15053–15057, 2021.
@inproceedings{jensen2021improving,
title = {Improving Causal Inference by Increasing Model Expressiveness},
author = {David Jensen},
url = {https://www.aaai.org/AAAI21Papers/SMT-427.JensenD.pdf},
year = {2021},
date = {2021-01-01},
booktitle = {Proceedings of the AAAI Conference on Artificial Intelligence},
volume = {35},
number = {17},
pages = {15053--15057},
abstract = {The ability to learn and reason with causal knowledge is a key aspect of intelligent behavior. In contrast to mere statistical association, knowledge of causation enables reasoning about the effects of actions. Causal reasoning is vital for autonomous agents and for a range of applications in science, medicine, business, and government. However, current methods for causal inference are hobbled because they use relatively inexpressive models. Surprisingly, current causal models eschew nearly every major representational innovation common in a range of other fields both inside and outside of computer science, including representation of objects, relationships, time, space, and hierarchy. Even more surprisingly, a range of recent research provides strong evidence that more expressive representations make possible causal inferences that are otherwise impossible and remove key biases that would otherwise afflict more naive inferences. New research on causal inference should target increases in expressiveness to improve accuracy and effectiveness.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Akanksha Atrey, Prashant J. Shenoy, David Jensen
Preserving Privacy in Personalized Models for Distributed Mobile Services Miscellaneous
2021.
@misc{DBLP:journals/corr/abs-2101-05855,
title = {Preserving Privacy in Personalized Models for Distributed Mobile Services},
author = {Akanksha Atrey and Prashant J. Shenoy and David Jensen},
url = {https://arxiv.org/abs/2101.05855},
year = {2021},
date = {2021-01-01},
journal = {CoRR},
volume = {abs/2101.05855},
abstract = {The ubiquity of mobile devices has led to the proliferation of mobile services that provide personalized and context-aware content to their users. Modern mobile services are distributed between end-devices, such as smartphones, and remote servers that reside in the cloud. Such services thrive on their ability to predict future contexts to pre-fetch content or make context-specific recommendations. An increasingly common method to predict future contexts, such as location, is via machine learning (ML) models. Recent work in context prediction has focused on ML model personalization where a personalized model is learned for each individual user in order to tailor predictions or recommendations to a user's mobile behavior. While the use of personalized models increases efficacy of the mobile service, we argue that it increases privacy risk since a personalized model encodes contextual behavior unique to each user. To demonstrate these privacy risks, we present several attribute inference-based privacy attacks and show that such attacks can leak privacy with up to 78% efficacy for top-3 predictions. We present Pelican, a privacy-preserving personalization system for context-aware mobile services that leverages both device and cloud resources to personalize ML models while minimizing the risk of privacy leakage for users. We evaluate Pelican using real world traces for location-aware mobile services and show that Pelican can substantially reduce privacy leakage by up to 75%.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Sam Witty, David Jensen, Vikash Mansinghka
A Simulation-Based Test of Identifiability for Bayesian Causal Inference Miscellaneous
2021.
@misc{DBLP:journals/corr/abs-2102-11761,
title = {A Simulation-Based Test of Identifiability for Bayesian Causal Inference},
author = {Sam Witty and David Jensen and Vikash Mansinghka},
url = {https://arxiv.org/abs/2102.11761},
year = {2021},
date = {2021-01-01},
journal = {CoRR},
volume = {abs/2102.11761},
abstract = {This paper introduces a procedure for testing the identifiability of Bayesian models for causal inference. Although the do-calculus is sound and complete given a causal graph, many practical assumptions cannot be expressed in terms of graph structure alone, such as the assumptions required by instrumental variable designs, regression discontinuity designs, and within-subjects designs. We present simulation-based identifiability (SBI), a fully automated identification test based on a particle optimization scheme with simulated observations. This approach expresses causal assumptions as priors over functions in a structural causal model, including flexible priors using Gaussian processes. We prove that SBI is asymptotically sound and complete, and produces practical finite-sample bounds. We also show empirically that SBI agrees with known results in graph-based identification as well as with widely-held intuitions for designs in which graph-based methods are inconclusive.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2020
David Ifeoluwa Adelani, Ryota Kobayashi, Ingmar Weber, Przemyslaw A. Grabowicz
Estimating community feedback effect on topic choice in social media with predictive modeling Journal Article
In: EPJ Data Science, vol. 9, no. 1, pp. 25, 2020, ISSN: 2193-1127.
@article{Adelani2020,
title = {Estimating community feedback effect on topic choice in social media with predictive modeling},
author = {David Ifeoluwa Adelani and Ryota Kobayashi and Ingmar Weber and Przemyslaw A. Grabowicz},
url = {http://dx.doi.org/10.1140/epjds/s13688-020-00243-w https://epjdatascience.springeropen.com/articles/10.1140/epjds/s13688-020-00243-w},
doi = {10.1140/epjds/s13688-020-00243-w},
issn = {2193-1127},
year = {2020},
date = {2020-12-01},
journal = {EPJ Data Science},
volume = {9},
number = {1},
pages = {25},
publisher = {The Author(s)},
abstract = {Social media users post content on various topics. A defining feature of social media is that other users can provide feedback—called community feedback—to their content in the form of comments, replies, and retweets. We hypothesize that the amount of received feedback influences the choice of topics on which a social media user posts. However, it is challenging to test this hypothesis as user heterogeneity and external confounders complicate measuring the feedback effect. Here, we investigate this hypothesis with a predictive approach based on an interpretable model of an author's decision to continue the topic of their previous post. We explore the confounding factors, including author's topic preferences and unobserved external factors such as news and social events, by optimizing the predictive accuracy. This approach enables us to identify which users are susceptible to community feedback. Overall, we find that 33% and 14% of active users in Reddit and Twitter, respectively, are influenced by community feedback. The model suggests that this feedback alters the probability of topic continuation up to 14%, depending on the user and the amount of feedback.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
David Jensen, Javier Burroni, Matthew Rattigan
Object conditioning for causal inference Proceedings Article
In: Uncertainty in Artificial Intelligence, pp. 1072–1082, PMLR 2020.
@inproceedings{jensen2020object,
title = {Object conditioning for causal inference},
author = {David Jensen and Javier Burroni and Matthew Rattigan},
url = {http://proceedings.mlr.press/v115/jensen20a/jensen20a.pdf},
year = {2020},
date = {2020-01-01},
booktitle = {Uncertainty in Artificial Intelligence},
pages = {1072--1082},
organization = {PMLR},
abstract = {We describe and analyze a form of conditioning that is widely applied within social science and applied statistics but that is virtually unknown within causal graphical models. This approach, which we term object conditioning, can adjust for the effects of latent confounders and yet avoid the pitfall of conditioning on colliders. We describe object conditioning using plate models and show how its probabilistic implications can be explained using the property of exchangeability. We show that several seemingly obvious interpretations of object conditioning are insufficient to describe its probabilistic implications. Finally, we use object conditioning to describe and unify key aspects of a diverse set of techniques for causal inference, including within-subjects designs, difference-in-differences designs, and interrupted time-series designs.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Akanksha Atrey, Kaleigh Clary, David Jensen
Exploratory Not Explanatory: Counterfactual Analysis of Saliency Maps for Deep Reinforcement Learning Proceedings Article
In: International Conference on Learning Representations, 2020.
@inproceedings{atrey2020exploratory,
title = {Exploratory Not Explanatory: Counterfactual Analysis of Saliency Maps for Deep Reinforcement Learning},
author = {Akanksha Atrey and Kaleigh Clary and David Jensen},
url = {https://openreview.net/pdf?id=rkl3m1BFDB},
year = {2020},
date = {2020-01-01},
booktitle = {International Conference on Learning Representations},
abstract = {Saliency maps are frequently used to support explanations of the behavior of deep reinforcement learning (RL) agents. However, a review of how saliency maps are used in practice indicates that the derived explanations are often unfalsifiable and can be highly subjective. We introduce an empirical approach grounded in counterfactual reasoning to test the hypotheses generated from saliency maps and assess the degree to which they correspond to the semantics of RL environments. We use Atari games, a common benchmark for deep RL, to evaluate three types of saliency maps. Our results show the extent to which existing claims about Atari games can be evaluated and suggest that saliency maps are best viewed as an exploratory tool rather than an explanatory tool.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Katherine A. Keith, David Jensen, Brendan O'Connor
Text and Causal Inference: A Review of Using Text to Remove Confounding from Causal Estimates Proceedings Article
In: Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, ACL 2020, Online, July 5-10, 2020, pp. 5332–5344, Association for Computational Linguistics, 2020.
@inproceedings{DBLP:conf/acl/KeithJO20,
title = {Text and Causal Inference: A Review of Using Text to Remove Confounding
from Causal Estimates},
author = {Katherine A. Keith and David Jensen and Brendan O'Connor},
url = {https://doi.org/10.18653/v1/2020.acl-main.474},
year = {2020},
date = {2020-01-01},
booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational
Linguistics, ACL 2020, Online, July 5-10, 2020},
pages = {5332--5344},
publisher = {Association for Computational Linguistics},
abstract = {Many applications of computational social science aim to infer causal conclusions from non-experimental data. Such observational data often contains confounders, variables that influence both potential causes and potential effects. Unmeasured or latent confounders can bias causal estimates, and this has motivated interest in measuring potential confounders from observed text. For example, an individual's entire history of social media posts or the content of a news article could provide a rich measurement of multiple confounders. Yet, methods and applications for this problem are scattered across different communities and evaluation practices are inconsistent. This review is the first to gather and categorize these examples and provide a guide to data-processing and evaluation decisions. Despite increased attention on adjusting for confounding using text, there are still many open problems, which we highlight in this paper.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sam Witty, Kenta Takatsu, David Jensen, Vikash Mansinghka
Causal Inference using Gaussian Processes with Structured Latent Confounders Proceedings Article
In: Proceedings of the 37th International Conference on Machine Learning, ICML 2020, 13-18 July 2020, Virtual Event, pp. 10313–10323, PMLR, 2020.
@inproceedings{DBLP:conf/icml/WittyTJM20,
title = {Causal Inference using Gaussian Processes with Structured Latent Confounders},
author = {Sam Witty and Kenta Takatsu and David Jensen and Vikash Mansinghka},
url = {http://proceedings.mlr.press/v119/witty20a.html},
year = {2020},
date = {2020-01-01},
booktitle = {Proceedings of the 37th International Conference on Machine Learning,
ICML 2020, 13-18 July 2020, Virtual Event},
volume = {119},
pages = {10313--10323},
publisher = {PMLR},
series = {Proceedings of Machine Learning Research},
abstract = {Latent confounders---unobserved variables that influence both treatment and outcome---can bias estimates of causal effects. In some cases, these confounders are shared across observations, e.g. all students taking a course are influenced by the course's difficulty in addition to any educational interventions they receive individually. This paper shows how to semiparametrically model latent confounders that have this structure and thereby improve estimates of causal effects. The key innovations are a hierarchical Bayesian model, Gaussian processes with structured latent confounders (GP-SLC), and a Monte Carlo inference algorithm for this model based on elliptical slice sampling. GP-SLC provides principled Bayesian uncertainty estimates of individual treatment effect with minimal assumptions about the functional forms relating confounders, covariates, treatment, and outcome. Finally, this paper shows GP-SLC is competitive with or more accurate than widely used causal inference techniques on three benchmark datasets, including the Infant Health and Development Program and a dataset showing the effect of changing temperatures on state-wide energy consumption across New England.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Amanda Gentzel, Justin Clarke, David Jensen
Using Experimental Data to Evaluate Methods for Observational Causal Inference Miscellaneous
2020.
@misc{DBLP:journals/corr/abs-2010-03051,
title = {Using Experimental Data to Evaluate Methods for Observational Causal
Inference},
author = {Amanda Gentzel and Justin Clarke and David Jensen},
url = {https://arxiv.org/abs/2010.03051},
year = {2020},
date = {2020-01-01},
journal = {CoRR},
volume = {abs/2010.03051},
abstract = {Methods that infer causal dependence from observational data are central to many areas of science, including medicine, economics, and the social sciences. A variety of theoretical properties of these methods have been proven, but empirical evaluation remains a challenge, largely due to the lack of observational data sets for which treatment effect is known. We propose and analyze observational sampling from randomized controlled trials (OSRCT), a method for evaluating causal inference methods using data from randomized controlled trials (RCTs). This method can be used to create constructed observational data sets with corresponding unbiased estimates of treatment effect, substantially increasing the number of data sets available for evaluating causal inference methods. We show that, in expectation, OSRCT creates data sets that are equivalent to those produced by randomly sampling from empirical data sets in which all potential outcomes are available. We analyze several properties of OSRCT theoretically and empirically, and we demonstrate its use by comparing the performance of four causal inference methods using data from eleven RCTs.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2019
Przemyslaw A. Grabowicz, Nicholas Perello, Kenta Takatsu
Resilience of Supervised Learning Algorithms to Discriminatory Data Perturbations Journal Article
In: 2019.
@article{Grabowicz2019c,
title = {Resilience of Supervised Learning Algorithms to Discriminatory Data Perturbations},
author = {Przemyslaw A. Grabowicz and Nicholas Perello and Kenta Takatsu},
url = {http://arxiv.org/abs/1912.08189},
year = {2019},
date = {2019-12-01},
abstract = {Discrimination is a focal concern in supervised learning algorithms augmenting human decision-making. These systems are trained using historical data, which may have been tainted by discrimination, and may learn biases against the protected groups. An important question is how to train models without propagating discrimination. In this study, we i) define and model discrimination as perturbations of a data-generating process and show how discrimination can be induced via attributes correlated with the protected attributes; ii) introduce a measure of resilience of a supervised learning algorithm to potentially discriminatory data perturbations; iii) propose a novel supervised learning algorithm that inhibits discrimination; and iv) show that it is more resilient to discriminatory perturbations in synthetic and real-world datasets than state-of-the-art learning algorithms. The proposed method can be used with general supervised learning algorithms and avoids inducement of discrimination, while maximizing model accuracy.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
David Jensen et al.
Comment: Strengthening empirical evaluation of causal inference methods Journal Article
In: Statistical Science, vol. 34, no. 1, pp. 77–81, 2019.
@article{jensen2019comment,
title = {Comment: Strengthening empirical evaluation of causal inference methods},
author = {David Jensen and others},
url = {https://projecteuclid.org/journals/statistical-science/volume-34/issue-1/Comment-Strengthening-Empirical-Evaluation-of-Causal-Inference-Methods/10.1214/18-STS690.short},
year = {2019},
date = {2019-01-01},
journal = {Statistical Science},
volume = {34},
number = {1},
pages = {77--81},
publisher = {Institute of Mathematical Statistics},
abstract = {This is a contribution to the discussion of the paper by Dorie et al. (Statist. Sci. 34 (2019) 43–68), which reports the lessons learned from the 2016 Atlantic Causal Inference Conference Competition. My comments strongly support the authors’ focus on empirical evaluation, using examples and experience from machine learning research, particularly focusing on the problem of algorithmic complexity. I argue that even broader and deeper empirical evaluation should be undertaken by the researchers who study causal inference. Finally, I highlight a few key conclusions that suggest where future research should focus.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Emma Tosch, Eytan Bakshy, Emery D Berger, David Jensen, J Eliot B Moss
PlanAlyzer: assessing threats to the validity of online experiments Journal Article
In: Proceedings of the ACM on Programming Languages, vol. 3, no. OOPSLA, pp. 1–30, 2019.
@article{tosch2019planalyzer,
title = {PlanAlyzer: assessing threats to the validity of online experiments},
author = {Emma Tosch and Eytan Bakshy and Emery D Berger and David Jensen and J Eliot B Moss},
url = {https://dl.acm.org/doi/pdf/10.1145/3360608},
year = {2019},
date = {2019-01-01},
journal = {Proceedings of the ACM on Programming Languages},
volume = {3},
number = {OOPSLA},
pages = {1--30},
publisher = {ACM New York, NY, USA},
abstract = {Online experiments are ubiquitous. As the scale of experiments has grown, so has the complexity of their design and implementation. In response, firms have developed software frameworks for designing and deploying online experiments. Ensuring that experiments in these frameworks are correctly designed and that their results are trustworthy---referred to as *internal validity*---can be difficult. Currently, verifying internal validity requires manual inspection by someone with substantial expertise in experimental design. We present the first approach for statically checking the internal validity of online experiments. Our checks are based on well-known problems that arise in experimental design and causal inference. Our analyses target PlanOut, a widely deployed, open-source experimentation framework that uses a domain-specific language to specify and run complex experiments. We have built a tool, PlanAlyzer, that checks PlanOut programs for a variety of threats to internal validity, including failures of randomization, treatment assignment, and causal sufficiency. PlanAlyzer uses its analyses to automatically generate *contrasts*, a key type of information required to perform valid statistical analyses over experimental results. We demonstrate PlanAlyzer's utility on a corpus of PlanOut scripts deployed in production at Facebook, and we evaluate its ability to identify threats to validity on a mutated subset of this corpus. PlanAlyzer has both precision and recall of 92% on the mutated corpus, and 82% of the contrasts it automatically generates match hand-specified data.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Huseyin Oktay, Akanksha Atrey, David Jensen
Identifying when effect restoration will improve estimates of causal effect Proceedings Article
In: Proceedings of the 2019 SIAM International Conference on Data Mining, pp. 190–198, Society for Industrial and Applied Mathematics 2019.
@inproceedings{oktay2019identifying,
title = {Identifying when effect restoration will improve estimates of causal effect},
author = {Huseyin Oktay and Akanksha Atrey and David Jensen},
url = {https://doi.org/10.1137/1.9781611975673.22},
year = {2019},
date = {2019-01-01},
booktitle = {Proceedings of the 2019 SIAM International Conference on Data Mining},
pages = {190--198},
organization = {Society for Industrial and Applied Mathematics},
abstract = {Several methods have been developed that combine multiple models learned on different data sets and then use that combination to reach conclusions that would not have been possible with any one of the models alone. We examine one such method—effect restoration—which was originally developed to mitigate the effects of poorly measured confounding variables in a causal model. We show how effect restoration can be used to combine results from different machine learning models and how the combined model can be used to estimate causal effects that are not identifiable from either of the original studies alone. We characterize the performance of effect restoration by using both theoretical analysis and simulation studies. Specifically, we show how conditional independence tests and common assumptions can help distinguish when effect restoration should and should not be applied, and we use empirical analysis to show the limited range of conditions under which effect restoration should be applied in practical situations.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Amanda Gentzel, Dan Garant, David Jensen
The Case for Evaluating Causal Models Using Interventional Measures and Empirical Data Proceedings Article
In: Advances in Neural Information Processing Systems, Curran Associates, Inc., 2019.
@inproceedings{gentzel2019case,
title = {The Case for Evaluating Causal Models Using Interventional Measures and Empirical Data},
author = {Amanda Gentzel and Dan Garant and David Jensen},
url = {https://proceedings.neurips.cc/paper/2019/file/a87c11b9100c608b7f8e98cfa316ff7b-Paper.pdf},
year = {2019},
date = {2019-01-01},
booktitle = {Advances in Neural Information Processing Systems},
volume = {32},
publisher = {Curran Associates, Inc.},
abstract = {Causal inference is central to many areas of artificial intelligence, including complex reasoning, planning, knowledge-base construction, robotics, explanation, and fairness. An active community of researchers develops and enhances algorithms that learn causal models from data, and this work has produced a series of impressive technical advances. However, evaluation techniques for causal modeling algorithms have remained somewhat primitive, limiting what we can learn from experimental studies of algorithm performance, constraining the types of algorithms and model representations that researchers consider, and creating a gap between theory and practice. We argue for more frequent use of evaluation techniques that examine interventional measures rather than structural or observational measures, and that evaluate those measures on empirical data rather than synthetic data. We survey the current practice in evaluation and show that the techniques we recommend are rarely used in practice. We show that such techniques are feasible and that data sets are available to conduct such evaluations. We also show that these techniques produce substantially different results than using structural measures and synthetic data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Emma Tosch, Kaleigh Clary, John Foley, David Jensen
Toybox: A Suite of Environments for Experimental Evaluation of Deep Reinforcement Learning Miscellaneous
2019.
@misc{DBLP:journals/corr/abs-1905-02825,
title = {Toybox: A Suite of Environments for Experimental Evaluation of Deep
Reinforcement Learning},
author = {Emma Tosch and Kaleigh Clary and John Foley and David Jensen},
url = {http://arxiv.org/abs/1905.02825},
year = {2019},
date = {2019-01-01},
journal = {CoRR},
volume = {abs/1905.02825},
abstract = {Evaluation of deep reinforcement learning (RL) is inherently challenging. In particular, learned policies are largely opaque, and hypotheses about the behavior of deep RL agents are difficult to test in black-box environments. Considerable effort has gone into addressing opacity, but almost no effort has been devoted to producing high quality environments for experimental evaluation of agent behavior. We present TOYBOX, a new high-performance, open-source subset of Atari environments re-designed for the experimental evaluation of deep RL. We show that TOYBOX enables a wide range of experiments and analyses that are impossible in other environments.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Sam Witty, Alexander Lew, David Jensen, Vikash Mansinghka
Bayesian causal inference via probabilistic program synthesis Miscellaneous
2019.
@misc{witty2019bayesian,
title = {Bayesian causal inference via probabilistic program synthesis},
author = {Sam Witty and Alexander Lew and David Jensen and Vikash Mansinghka},
url = {https://arxiv.org/pdf/1910.14124.pdf},
year = {2019},
date = {2019-01-01},
journal = {arXiv preprint arXiv:1910.14124},
abstract = {Causal inference can be formalized as Bayesian inference that combines a prior distribution over causal models and likelihoods that account for both observations and interventions. We show that it is possible to implement this approach using a sufficiently expressive probabilistic programming language. Priors are represented using probabilistic programs that generate source code in a domain specific language. Interventions are represented using probabilistic programs that edit this source code to modify the original generative process. This approach makes it straightforward to incorporate data from atomic interventions, as well as shift interventions, variance-scaling interventions, and other interventions that modify causal structure. This approach also enables the use of general-purpose inference machinery for probabilistic programs to infer probable causal structures and parameters from data. This abstract describes a prototype of this approach in the Gen probabilistic programming language.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2018
John Foley, Emma Tosch, Kaleigh Clary, David Jensen
Toybox: Better Atari Environments for Testing Reinforcement Learning Agents Proceedings Article
In: NeurIPS 2018 Workshop on Systems for ML, 2018.
@inproceedings{foley2018toybox,
title = {Toybox: Better Atari Environments for Testing Reinforcement Learning Agents},
author = {John Foley and Emma Tosch and Kaleigh Clary and David Jensen},
url = {http://learningsys.org/nips18/assets/papers/83CameraReadySubmissionNIPS_Systems_for_ML_Workshop_2019___ToyBox%20(11).pdf},
year = {2018},
date = {2018-01-01},
booktitle = {NeurIPS 2018 Workshop on Systems for ML},
abstract = {It is a widely accepted principle that software without tests has bugs. Testing reinforcement learning agents is especially difficult because of the stochastic nature of both agents and environments, the complexity of state-of-the-art models, and the sequential nature of their predictions. Recently, the Arcade Learning Environment (ALE) has become one of the most widely used benchmark suites for deep learning research, and state-of-the-art Reinforcement Learning (RL) agents have been shown to routinely equal or exceed human performance on many ALE tasks. Since ALE is based on emulation of original Atari games, the environment does not provide semantically meaningful representations of internal game state. This means that ALE has limited utility as an environment for supporting testing or model introspection. We propose TOYBOX, a collection of reimplementations of these games that solves this critical problem and enables robust testing of RL agents.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sam Witty, David Jensen
Causal Graphs vs. Causal Programs: The Case of Conditional Branching Proceedings Article
In: First Conference on Probabilistic Programming (ProbProg), 2018.
@inproceedings{witty2018causal,
title = {Causal Graphs vs. Causal Programs: The Case of Conditional Branching},
author = {Sam Witty and David Jensen},
url = {https://arxiv.org/pdf/2007.07127.pdf},
year = {2018},
date = {2018-01-01},
booktitle = {First Conference on Probabilistic Programming (ProbProg)},
abstract = {We evaluate the performance of graph-based causal discovery algorithms when the generative process is a probabilistic program with conditional branching. Using synthetic experiments, we demonstrate empirically that graph-based causal discovery algorithms are able to learn accurate associational distributions for probabilistic programs with context-sensitive structure, but that those graphs fail to accurately model the effects of interventions on the programs.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Sam Witty, Jun Ki Lee, Emma Tosch, Akanksha Atrey, Michael Littman, David Jensen
Measuring and characterizing generalization in deep reinforcement learning Miscellaneous
2018.
@misc{witty2018measuring,
title = {Measuring and characterizing generalization in deep reinforcement learning},
author = {Sam Witty and Jun Ki Lee and Emma Tosch and Akanksha Atrey and Michael Littman and David Jensen},
url = {https://arxiv.org/pdf/1812.02868.pdf},
year = {2018},
date = {2018-01-01},
journal = {arXiv preprint arXiv:1812.02868},
abstract = {Deep reinforcement-learning methods have achieved remarkable performance on challenging control tasks. Observations of the resulting behavior give the impression that the agent has constructed a generalized representation that supports insightful action decisions. We re-examine what is meant by generalization in RL, and propose several definitions based on an agent's performance in on-policy, off-policy, and unreachable states. We propose a set of practical methods for evaluating agents with these definitions of generalization. We demonstrate these techniques on a common benchmark task for deep RL, and we show that the learned networks make poor decisions for states that differ only slightly from on-policy states, even though those states are not selected adversarially. Taken together, these results call into question the extent to which deep Q-networks learn generalized representations, and suggest that more experimentation and analysis is necessary before claims of representation learning can be supported.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Kaleigh Clary, Emma Tosch, John Foley, David Jensen
Let's Play Again: Variability of Deep Reinforcement Learning Agents in Atari Environments Miscellaneous
2018.
@misc{clary2018variability,
title = {Let's Play Again: Variability of Deep Reinforcement Learning Agents in Atari Environments},
author = {Kaleigh Clary and Emma Tosch and John Foley and David Jensen},
url = {https://arxiv.org/pdf/1904.06312},
year = {2018},
date = {2018-01-01},
booktitle = {Critiquing and Correcting Trends in Machine Learning Workshop at Neural Information Processing Systems},
abstract = {Reproducibility in reinforcement learning is challenging: uncontrolled stochasticity from many sources, such as the learning algorithm, the learned policy, and the environment itself have led researchers to report the performance of learned agents using aggregate metrics of performance over multiple random seeds for a single environment. Unfortunately, there are still pernicious sources of variability in reinforcement learning agents that make reporting common summary statistics an unsound metric for performance. Our experiments demonstrate the variability of common agents used in the popular OpenAI Baselines repository. We make the case for reporting post-training agent performance as a distribution, rather than a point estimate.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2017
Kaleigh Clary, David Jensen
A/B Testing in Networks with Adversarial Members Journal Article
In: 2017.
@article{clary2017b,
title = {A/B Testing in Networks with Adversarial Members},
author = {Kaleigh Clary and David Jensen},
url = {http://www.mlgworkshop.org/2017/paper/MLG2017_paper_27.pdf},
year = {2017},
date = {2017-01-01},
abstract = {Many researchers attempt to study the effects of interventions in network systems. To simplify experimental design and analysis in these environments, simple assumptions are made about the behavior of their members. However, nodes may not respond to treatment, or may respond maliciously. These adversarial nodes influence treatment topology by preventing or altering the expected network effect, but may not be known or detectable. We characterize the influence of adversarial nodes and the bias these nodes introduce in average treatment effect estimates. In particular, we derive expressions for the bias induced in average treatment effect using the linear estimator from Gui et al. (2015). In addition to theoretical bounds, we empirically demonstrate estimation bias through experiments on synthetically generated networks. We consider both the case in which adversarial nodes are dispersed randomly through the network and the case where adversarial node placement is targeted to the highest degree nodes. Our work demonstrates that peer influence makes causal estimates on networks susceptible to the actions of adversaries, and specific network structures are particularly vulnerable to adversarial responses.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Javier Burroni, Arjun Guha, David Jensen
Interactive Writing and Debugging of Bayesian Probabilistic Programs Journal Article
In: 2017.
@article{burroni2017interactive,
title = {Interactive Writing and Debugging of Bayesian Probabilistic Programs},
author = {Javier Burroni and Arjun Guha and David Jensen},
url = {https://pps2018.luddy.indiana.edu/files/2017/12/interactive_debugger.pdf},
year = {2017},
date = {2017-01-01},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Katerina Marazopoulou, David Arbour, David Jensen
On causal analysis for heterogeneous networks Proceedings Article
In: The 2017 ACM SIGKDD Workshop on Causal Discovery, 2017.
@inproceedings{marazopoulou2017causal,
title = {On causal analysis for heterogeneous networks},
author = {Katerina Marazopoulou and David Arbour and David Jensen},
url = {http://nugget.unisa.edu.au/CD2017/slides/KaterinaMarazopoulou.pdf},
year = {2017},
date = {2017-01-01},
booktitle = {The 2017 ACM SIGKDD Workshop on Causal Discovery},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Kaleigh Clary, Andrew McGregor, David Jensen
A/B Testing in Networks with Adversarial Nodes Proceedings Article
In: KDD Workshop on Mining and Learning with Graphs, 2017.
@inproceedings{clary2017adversaries,
title = {A/B Testing in Networks with Adversarial Nodes},
author = {Kaleigh Clary and Andrew McGregor and David Jensen},
year = {2017},
date = {2017-01-01},
booktitle = {KDD Workshop on Mining and Learning with Graphs},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2016
David Arbour, Dan Garant, David Jensen
Inferring Network Effects from Observational Data Proceedings Article
In: Proceedings of the 22nd ACM SIGKDD International Conference on Knowledge Discovery and Data Mining, San Francisco, CA, USA, August 13-17, 2016, pp. 715–724, ACM, 2016.
@inproceedings{DBLP:conf/kdd/ArbourGJ16,
title = {Inferring Network Effects from Observational Data},
author = {David Arbour and Dan Garant and David Jensen},
url = {https://doi.org/10.1145/2939672.2939791},
doi = {10.1145/2939672.2939791},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 22nd ACM SIGKDD International Conference on
Knowledge Discovery and Data Mining, San Francisco, CA, USA, August
13-17, 2016},
pages = {715--724},
publisher = {ACM},
abstract = {We present Relational Covariate Adjustment (RCA), a general method for estimating causal effects in relational data. Relational Covariate Adjustment is implemented through two high-level operations: identification of an adjustment set and relational regression adjustment. The former is achieved through an extension of Pearl’s back-door criterion to relational domains. We demonstrate how this extended definition can be used to estimate causal effects in the presence of network interference and confounding. RCA is agnostic to functional form, and it can easily model both discrete and continuous treatments as well as estimate the effects of a wider array of network interventions than existing experimental approaches. We show that RCA can yield robust estimates of causal effects using common regression models without extensive parameter tuning. Through a series of simulation experiments on a variety of synthetic and real-world network structures, we show that causal effects estimated on observational data with RCA are nearly as accurate as those estimated from well-designed network experiments.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
David Arbour, Katerina Marazopoulou, David Jensen
Inferring Causal Direction from Relational Data Proceedings Article
In: Proceedings of the Thirty-Second Conference on Uncertainty in Artificial Intelligence, UAI 2016, June 25-29, 2016, New York City, NY, USA, AUAI Press, 2016.
@inproceedings{DBLP:conf/uai/ArbourMJ16,
title = {Inferring Causal Direction from Relational Data},
author = {David Arbour and Katerina Marazopoulou and David Jensen},
url = {http://auai.org/uai2016/proceedings/papers/217.pdf},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the Thirty-Second Conference on Uncertainty in Artificial
Intelligence, UAI 2016, June 25-29, 2016, New York City, NY, USA},
publisher = {AUAI Press},
abstract = {Inferring the direction of causal dependence from observational data is a fundamental problem in many scientific fields. Significant progress has been made in inferring causal direction from data that are independent and identically distributed (i.i.d.), but little is understood about this problem in the more general relational setting with multiple types of interacting entities. This work examines the task of inferring the causal direction of peer dependence in relational data. We show that, in contrast to the i.i.d. setting, the direction of peer dependence can be inferred using simple procedures, regardless of the form of the underlying distribution, and we provide a theoretical characterization on the identifiability of direction. We then examine the conditions under which the presence of confounding can be detected. Finally, we demonstrate the efficacy of the proposed methods with synthetic experiments, and we provide an application on real-world data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Shiri Dori-Hacohen, David Jensen, James Allan
Controversy Detection in Wikipedia Using Collective Classification Proceedings Article
In: Proceedings of the 39th International ACM SIGIR conference on Research and Development in Information Retrieval, SIGIR 2016, Pisa, Italy, July 17-21, 2016, pp. 797–800, ACM, 2016.
@inproceedings{DBLP:conf/sigir/Dori-HacohenJA16,
title = {Controversy Detection in Wikipedia Using Collective Classification},
author = {Shiri Dori-Hacohen and David Jensen and James Allan},
url = {https://doi.org/10.1145/2911451.2914745},
doi = {10.1145/2911451.2914745},
year = {2016},
date = {2016-01-01},
booktitle = {Proceedings of the 39th International ACM SIGIR conference on
Research and Development in Information Retrieval, SIGIR 2016, Pisa,
Italy, July 17-21, 2016},
pages = {797--800},
publisher = {ACM},
abstract = {Concerns over personalization in IR have sparked an interest in detection and analysis of controversial topics. Accurate detection would enable many beneficial applications, such as alerting search users to controversy. Wikipedia's broad coverage and rich metadata offer a valuable resource for this problem. We hypothesize that intensities of controversy among related pages are not independent; thus, we propose a stacked model which exploits the dependencies among related pages. Our approach improves classification of controversial web pages when compared to a model that examines each page in isolation, demonstrating that controversial topics exhibit homophily. Using notions of similarity to construct a subnetwork for collective classification, rather than using the default network present in the relational data, leads to improved classification with wider applications for semi-structured datasets, with the effects most pronounced when a small set of neighbors is used.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Katerina Marazopoulou, Rumi Ghosh, Prasanth Lade, David Jensen
Causal Discovery for Manufacturing Domains Miscellaneous
2016.
@misc{DBLP:journals/corr/MarazopoulouGLJ16,
title = {Causal Discovery for Manufacturing Domains},
author = {Katerina Marazopoulou and Rumi Ghosh and Prasanth Lade and David Jensen},
url = {http://arxiv.org/abs/1605.04056},
year = {2016},
date = {2016-01-01},
journal = {CoRR},
volume = {abs/1605.04056},
abstract = {Yield and quality improvement is of paramount importance to any manufacturing company. One of the ways of improving yield is through discovery of the root causal factors affecting yield. We propose the use of data-driven interpretable causal models to identify key factors affecting yield. We focus on factors that are measured in different stages of production and testing in the manufacturing cycle of a product. We apply causal structure learning techniques on real data collected from this line. Specifically, the goal of this work is to learn interpretable causal models from observational data produced by manufacturing lines. Emphasis has been given to the interpretability of the models to make them actionable in the field of manufacturing. We highlight the challenges presented by assembly line data and propose ways to alleviate them. We also identify unique characteristics of data originating from assembly lines and how to leverage them in order to improve causal discovery. Standard evaluation techniques for causal structure learning show that the learned causal models seem to closely represent the underlying latent causal relationship between different factors in the production process. These results were also validated by manufacturing domain experts who found them promising. This work demonstrates how data mining and knowledge discovery can be used for root cause analysis in the domain of manufacturing and connected industry.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
Dan Garant, David Jensen
Evaluating causal models by comparing interventional distributions Miscellaneous
2016.
@misc{garant2016evaluating,
title = {Evaluating causal models by comparing interventional distributions},
author = {Dan Garant and David Jensen},
url = {https://arxiv.org/abs/1608.04698},
year = {2016},
date = {2016-01-01},
journal = {arXiv preprint arXiv:1608.04698},
abstract = {The predominant method for evaluating the quality of causal models is to measure the graphical accuracy of the learned model structure. We present an alternative method for evaluating causal models that directly measures the accuracy of estimated interventional distributions. We contrast such distributional measures with structural measures, such as structural Hamming distance and structural intervention distance, showing that structural measures often correspond poorly to the accuracy of estimated interventional distributions. We use a number of real and synthetic datasets to illustrate various scenarios in which structural measures provide misleading results with respect to algorithm selection and parameter tuning, and we recommend that distributional measures become the new standard for evaluating causal models.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2015
Phillip B. Kirlin, David Jensen
Learning to Uncover Deep Musical Structure Proceedings Article
In: Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence, January 25-30, 2015, Austin, Texas, USA, pp. 1770–1776, AAAI Press, 2015.
@inproceedings{DBLP:conf/aaai/KirlinJ15,
title = {Learning to Uncover Deep Musical Structure},
author = {Phillip B. Kirlin and David Jensen},
url = {http://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9757},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the Twenty-Ninth AAAI Conference on Artificial Intelligence,
January 25-30, 2015, Austin, Texas, USA},
pages = {1770--1776},
publisher = {AAAI Press},
abstract = {The overarching goal of music theory is to explain the inner workings of a musical composition by examining the structure of the composition. Schenkerian music theory supposes that Western tonal compositions can be viewed as hierarchies of musical objects. The process of Schenkerian analysis reveals this hierarchy by identifying connections between notes or chords of a composition that illustrate both the small- and large-scale construction of the music. We present a new probabilistic model of this variety of music analysis, details of how the parameters of the model can be learned from a corpus, an algorithm for deriving the most probable analysis for a given piece of music, and both quantitative and human-based evaluations of the algorithm's performance. This represents the first large-scale data-driven computational approach to hierarchical music analysis.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Jerod J. Weinman, David Jensen, David Lopatto
Teaching Computing as Science in a Research Experience Proceedings Article
In: Proceedings of the 46th ACM Technical Symposium on Computer Science Education, SIGCSE 2015, Kansas City, MO, USA, March 4-7, 2015, pp. 24–29, ACM, 2015.
@inproceedings{DBLP:conf/sigcse/WeinmanJL15,
title = {Teaching Computing as Science in a Research Experience},
author = {Jerod J. Weinman and David Jensen and David Lopatto},
url = {https://doi.org/10.1145/2676723.2677231},
doi = {10.1145/2676723.2677231},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the 46th ACM Technical Symposium on Computer Science
Education, SIGCSE 2015, Kansas City, MO, USA, March 4-7, 2015},
pages = {24--29},
publisher = {ACM},
abstract = {Many instructors and institutions offer research experiences and training in computing research methods. However, in a national survey, we find that undergraduate students rate their computing research experiences lower than students in other STEM fields. To address this learning gap, we have offered summer undergraduate research experiences in computing that include not only instruction in the important mechanics of research but also grounding in a philosophy of computing science that emphasizes generalized explanation of behavior as a means for control and prediction. After five years, survey results indicate the experience helps close the gap between CS and other STEM fields in benefits gained.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Katerina Marazopoulou, Marc Maier, David Jensen
Learning the Structure of Causal Models with Relational and Temporal Dependence Proceedings Article
In: Proceedings of the UAI 2015 Workshop on Advances in Causal Inference co-located with the 31st Conference on Uncertainty in Artificial Intelligence (UAI 2015), Amsterdam, The Netherlands, July 16, 2015, pp. 66–75, CEUR-WS.org, 2015.
@inproceedings{DBLP:conf/uai/MarazopoulouMJ15,
title = {Learning the Structure of Causal Models with Relational and Temporal
Dependence},
author = {Katerina Marazopoulou and Marc Maier and David Jensen},
url = {http://ceur-ws.org/Vol-1504/uai2015aci_paper6.pdf},
year = {2015},
date = {2015-01-01},
booktitle = {Proceedings of the UAI 2015 Workshop on Advances in Causal Inference
co-located with the 31st Conference on Uncertainty in Artificial Intelligence
(UAI 2015), Amsterdam, The Netherlands, July 16, 2015},
volume = {1504},
pages = {66--75},
publisher = {CEUR-WS.org},
series = {CEUR Workshop Proceedings},
abstract = {Many real-world domains are inherently relational and temporal—they consist of heterogeneous entities that interact with each other over time. Effective reasoning about causality in such domains requires representations that explicitly model relational and temporal dependence. In this work, we provide a formalization of temporal relational models. We define temporal extensions to abstract ground graphs—a lifted representation that abstracts paths of dependence over all possible ground graphs. Temporal abstract ground graphs enable a sound and complete method for answering d-separation queries on temporal relational models. These methods provide the foundation for a constraint-based algorithm, TRCD, that learns causal models from temporal relational data. We provide experimental evidence that demonstrates the need to explicitly represent time when inferring causal dependence. We also demonstrate the expressive gain of TRCD compared to earlier algorithms that do not explicitly represent time.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2014
Lisa Friedland, Amanda Gentzel, David Jensen
Classifier-adjusted density estimation for anomaly detection and one-class classification Proceedings Article
In: Proceedings of the 2014 SIAM International Conference on Data Mining, pp. 578–586, Society for Industrial and Applied Mathematics 2014.
@inproceedings{friedland2014classifier,
title = {Classifier-adjusted density estimation for anomaly detection and one-class classification},
author = {Lisa Friedland and Amanda Gentzel and David Jensen},
url = {https://epubs.siam.org/doi/pdf/10.1137/1.9781611973440.67},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the 2014 SIAM International Conference on Data Mining},
pages = {578--586},
organization = {Society for Industrial and Applied Mathematics},
abstract = {Density estimation methods are often regarded as unsuitable for anomaly detection in high-dimensional data due to the difficulty of estimating multivariate probability distributions. Instead, the scores from popular distance- and local-density-based methods, such as local outlier factor (LOF), are used as surrogates for probability densities. We question this infeasibility assumption and explore a family of simple statistically-based density estimates constructed by combining a probabilistic classifier with a naive density estimate. Across a number of semi-supervised and unsupervised problems formed from real-world data sets, we show that these methods are competitive with LOF and that even simple density estimates that assume attribute independence can perform strongly. We show that these density estimation methods scale well to data with high dimensionality and that they are robust to the problem of irrelevant attributes that plagues methods based on local estimates.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
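The classifier-plus-naive-density construction described in the abstract above is simple enough to sketch. Below is a minimal illustration, assuming a uniform reference sample and a random-forest classifier (both are assumptions made for brevity, not the authors' exact configuration); because the reference points are drawn uniformly over the data's bounding box, the classifier's odds ratio is proportional to the estimated density, and low scores flag likely anomalies.
import numpy as np
from sklearn.ensemble import RandomForestClassifier

def cade_scores(X_real, seed=0):
    """Classifier-adjusted density scores: lower means more anomalous."""
    rng = np.random.default_rng(seed)
    n, d = X_real.shape
    # Reference sample: uniform over the bounding box of the real data.
    X_fake = rng.uniform(X_real.min(axis=0), X_real.max(axis=0), size=(n, d))
    X = np.vstack([X_real, X_fake])
    y = np.concatenate([np.ones(n), np.zeros(n)])
    clf = RandomForestClassifier(n_estimators=200, random_state=seed).fit(X, y)
    p = clf.predict_proba(X_real)[:, 1]          # P(real | x)
    # With a uniform reference density, this odds ratio is proportional to p(x).
    return p / np.clip(1.0 - p, 1e-6, None)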
David Arbour, Katerina Marazopoulou, Dan Garant, David Jensen
Propensity Score Matching for Causal Inference with Relational Data Proceedings Article
In: Proceedings of the UAI 2014 Workshop Causal Inference: Learning and Prediction co-located with 30th Conference on Uncertainty in Artificial Intelligence (UAI 2014), Quebec City, Canada, July 27, 2014, pp. 25–34, CEUR-WS.org, 2014.
@inproceedings{DBLP:conf/uai/ArbourMGJ14,
title = {Propensity Score Matching for Causal Inference with Relational Data},
author = {David Arbour and Katerina Marazopoulou and Dan Garant and David Jensen},
url = {http://ceur-ws.org/Vol-1274/uai2014ci_paper5.pdf},
year = {2014},
date = {2014-01-01},
booktitle = {Proceedings of the UAI 2014 Workshop Causal Inference: Learning and Prediction co-located with 30th Conference on Uncertainty in Artificial Intelligence (UAI 2014), Quebec City, Canada, July 27, 2014},
volume = {1274},
pages = {25--34},
publisher = {CEUR-WS.org},
series = {CEUR Workshop Proceedings},
abstract = {Propensity score matching (PSM) is a widely used method for performing causal inference with observational data. PSM requires fully specifying the set of confounding variables of treatment and outcome. In the case of relational data, this set may include non-intuitive relational variables, i.e., variables derived from the relational structure of the data. In this work, we provide an automated method to derive these relational variables based on the relational structure and a set of naive confounders. This automatic construction includes two unusual classes of variables: relational degree and entity identifiers. We provide experimental evidence that demonstrates the utility of these variables in accounting for certain latent confounders. Finally, through a set of synthetic experiments, we show that our method improves the performance of PSM for causal inference with relational data.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
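As a reminder of the baseline procedure this paper builds on, the sketch below performs plain propensity score matching (logistic propensity model, 1-nearest-neighbour matching, ATT estimate). The relational variables the paper derives automatically, such as degree counts and identifiers of related entities, would enter simply as additional columns of X; this is a generic illustration, not the authors' algorithm.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import NearestNeighbors

def psm_att(X, treated, outcome):
    """1-NN propensity score matching; returns the effect on the treated."""
    ps = LogisticRegression(max_iter=1000).fit(X, treated).predict_proba(X)[:, 1]
    t_idx = np.flatnonzero(treated == 1)
    c_idx = np.flatnonzero(treated == 0)
    nn = NearestNeighbors(n_neighbors=1).fit(ps[c_idx].reshape(-1, 1))
    _, match = nn.kneighbors(ps[t_idx].reshape(-1, 1))
    matched = c_idx[match.ravel()]
    return float(np.mean(outcome[t_idx] - outcome[matched]))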
Katerina Marazopoulou, David Arbour, David Jensen
Refining the Semantics of Social Influence Miscellaneous
2014.
@misc{DBLP:journals/corr/MarazopoulouAJ14,
title = {Refining the Semantics of Social Influence},
author = {Katerina Marazopoulou and David Arbour and David Jensen},
url = {http://arxiv.org/abs/1412.5238},
year = {2014},
date = {2014-01-01},
journal = {CoRR},
volume = {abs/1412.5238},
abstract = {With the proliferation of network data, researchers are increasingly focusing on questions investigating phenomena occurring on networks. This often includes analysis of peer-effects, i.e., how the connections of an individual affect that individual's behavior. This type of influence is not limited to direct connections of an individual (such as friends), but also to individuals that are connected through longer paths (for example, friends of friends, or friends of friends of friends). In this work, we identify an ambiguity in the definition of what constitutes the extended neighborhood of an individual. This ambiguity gives rise to different semantics and supports different types of underlying phenomena. We present experimental results, both on synthetic and real networks, that quantify differences among the sets of extended neighbors under different semantics. Finally, we provide experimental evidence that demonstrates how the use of different semantics affects model selection.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
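The ambiguity referred to in the abstract is easy to see in code: "friends of friends" can mean the endpoints of length-2 paths or the nodes at shortest-path distance exactly 2, and the two sets generally differ. A small sketch (using networkx purely for illustration):
import networkx as nx

def two_hop_by_paths(G, v):
    """All endpoints of length-2 walks from v (may include direct neighbours)."""
    return {w for u in G.neighbors(v) for w in G.neighbors(u)} - {v}

def two_hop_by_distance(G, v):
    """Nodes whose shortest-path distance from v is exactly 2."""
    lengths = nx.single_source_shortest_path_length(G, v, cutoff=2)
    return {w for w, d in lengths.items() if d == 2}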
2013
David Arbour, James Atwood, Ahmed El-Kishky, David Jensen
Agglomerative Clustering of Bagged Data Using Joint Distributions Journal Article
In: 2013.
@article{arbour2013agglomerative,
title = {Agglomerative Clustering of Bagged Data Using Joint Distributions},
author = {David Arbour and James Atwood and Ahmed El-Kishky and David Jensen},
url = {http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.703.7198&rep=rep1&type=pdf},
year = {2013},
date = {2013-01-01},
publisher = {Citeseer},
abstract = {Current methods for hierarchical clustering of data either operate on features of the data or make limiting model assumptions. We present the hierarchy discovery algorithm (HDA), a model-based hierarchical clustering method based on explicit comparison of joint distributions via Bayesian network learning for predefined groups of data. HDA works on both continuous and discrete data and offers a model-based approach to agglomerative clustering that does not require prespecification of the model dependency structure.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
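To convey the flavour of model-based agglomeration over predefined groups, here is a heavily simplified stand-in: each group is summarized by a multivariate Gaussian and, at each step, the pair of clusters closest under symmetric KL divergence is merged. HDA itself compares learned Bayesian networks; the Gaussian summary and the ridge term are assumptions made only to keep the sketch short.
import numpy as np
from itertools import combinations

def gaussian_kl(mu0, S0, mu1, S1):
    """KL divergence between two multivariate Gaussians."""
    d = len(mu0)
    inv1 = np.linalg.inv(S1)
    diff = mu1 - mu0
    return 0.5 * (np.trace(inv1 @ S0) + diff @ inv1 @ diff - d
                  + np.log(np.linalg.det(S1) / np.linalg.det(S0)))

def agglomerate(groups):
    """groups: dict of name -> (n, d) array with several rows per group.
    Returns the order in which groups are merged."""
    clusters = {k: np.asarray(v, float) for k, v in groups.items()}
    merges = []
    while len(clusters) > 1:
        stats = {k: (v.mean(0), np.cov(v, rowvar=False) + 1e-6 * np.eye(v.shape[1]))
                 for k, v in clusters.items()}
        a, b = min(combinations(clusters, 2),
                   key=lambda p: gaussian_kl(*stats[p[0]], *stats[p[1]])
                                 + gaussian_kl(*stats[p[1]], *stats[p[0]]))
        clusters[f"({a}+{b})"] = np.vstack([clusters.pop(a), clusters.pop(b)])
        merges.append((a, b))
    return merges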
Lisa Friedland, David Jensen, Michael Lavine
Copy or Coincidence? A Model for Detecting Social Influence and Duplication Events Proceedings Article
In: Proceedings of the 30th International Conference on Machine Learning, ICML 2013, Atlanta, GA, USA, 16-21 June 2013, pp. 1175–1183, JMLR.org, 2013.
@inproceedings{DBLP:conf/icml/FriedlandJL13,
title = {Copy or Coincidence? A Model for Detecting Social Influence and Duplication Events},
author = {Lisa Friedland and David Jensen and Michael Lavine},
url = {http://proceedings.mlr.press/v28/friedland13.html},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the 30th International Conference on Machine Learning, ICML 2013, Atlanta, GA, USA, 16-21 June 2013},
volume = {28},
pages = {1175--1183},
publisher = {JMLR.org},
series = {JMLR Workshop and Conference Proceedings},
abstract = {In this paper, we analyze the task of inferring rare links between pairs of entities that seem too similar to have occurred by chance. Variations of this task appear in such diverse areas as social network analysis, security, fraud detection, and entity resolution. To address the task in a general form, we propose a simple, flexible mixture model in which most entities are generated independently from a distribution but a small number of pairs are constrained to be similar. We predict the true pairs using a likelihood ratio that trades off the entities’ similarity with their rarity. This method always outperforms using only similarity; however, with certain parameter settings, similarity turns out to be surprisingly competitive. Using real data, we apply the model to detect twins given their birth weights and to re-identify cell phone users based on distinctive usage patterns.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
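The similarity-versus-rarity trade-off has a compact one-dimensional illustration. Assuming (purely for illustration) a Gaussian population model for the trait and Gaussian copy noise, the likelihood ratio below is large when y is both close to x and rare under the population model, which is exactly the trade-off the abstract describes; the model choices are assumptions, not the paper's exact formulation.
from scipy.stats import norm

def copy_likelihood_ratio(x, y, mu, sigma, tau):
    """P(y | y is a noisy copy of x) / P(y | independent draw from N(mu, sigma^2))."""
    return norm.pdf(y, loc=x, scale=tau) / norm.pdf(y, loc=mu, scale=sigma)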
Marc Maier, Katerina Marazopoulou, David Arbour, David Jensen
A Sound and Complete Algorithm for Learning Causal Models from Relational Data Proceedings Article
In: Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence, UAI 2013, Bellevue, WA, USA, August 11-15, 2013, AUAI Press, 2013.
@inproceedings{DBLP:conf/uai/MaierMAJ13,
title = {A Sound and Complete Algorithm for Learning Causal Models from Relational Data},
author = {Marc Maier and Katerina Marazopoulou and David Arbour and David Jensen},
url = {https://dslpitt.org/uai/displayArticleDetails.jsp?mmnu=1&smnu=2&article_id=2398&proceeding_id=29},
year = {2013},
date = {2013-01-01},
booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence, UAI 2013, Bellevue, WA, USA, August 11-15, 2013},
publisher = {AUAI Press},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Marc Maier, Katerina Marazopoulou, David Arbour, David Jensen
Flattening network data for causal discovery: What could go wrong? Proceedings Article
In: Workshop on Information in Networks, 2013.
@inproceedings{maier2013flattening,
title = {Flattening network data for causal discovery: What could go wrong?},
author = {Marc Maier and Katerina Marazopoulou and David Arbour and David Jensen},
url = {https://www.semanticscholar.org/paper/Flattening-network-data-for-causal-discovery-%3A-What-Maier-Marazopoulou/c327100636c022c259f5e1bf2d7fcbbd0b048935},
year = {2013},
date = {2013-01-01},
booktitle = {Workshop on Information in Networks},
volume = {64},
abstract = {Methods for learning causal dependencies from observational data have been the focus of decades of work in social science, statistics, machine learning, and philosophy [9, 10, 11]. Much of the theoretical and practical work on causal discovery has focused on propositional representations. Propositional models effectively represent individual directed causal dependencies (e.g., path analysis, Bayesian networks) or conditional distributions of some outcome variable (e.g., linear regression, decision trees). However, propositional representations are limited to modeling independent and identically distributed (IID) data of a single entity type. Many real-world systems involve heterogeneous, interacting entities with probabilistic dependencies that cross the boundaries of those entities (i.e., non-IID data with multiple entity types and relationships). These systems produce network, or relational, data, and they are of paramount interest to researchers and practitioners across a wide range of disciplines. To model such data, researchers in statistics and computer science have devised more expressive classes of directed graphical models, such as probabilistic relational models (PRMs) [2] and directed acyclic probabilistic entity-relationship (DAPER) models [4]. Despite the assumptions embedded in propositional models, a common practice is to flatten, or propositionalize, relational data and use existing algorithms [5] (see Figure 1, focusing on algorithms that learn causal graphical models). While there are statistical concerns, this process is generally innocuous if the task is to model statistical associations for predictive inference. In contrast, to learn causal structure, estimate causal effects, or support inference over interventions, the effects of flattening inherently relational data can be particularly deleterious. In this paper, we identify four classes of potential issues that can occur with a propositionalization strategy as opposed to embracing a more expressive representation that would not succumb to these problems. We also present empirical results comparing the effectiveness of two theoretically sound and complete algorithms that learn causal structure: PC—a widely used constraint-based, propositional algorithm for causal discovery [11], and RCD—a recently developed constraint-based algorithm that reasons over a relational representation [6].},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
Marc Maier, Katerina Marazopoulou, David Jensen
Reasoning about Independence in Probabilistic Models of Relational Data Miscellaneous
2013.
@misc{DBLP:journals/corr/abs-1302-4381,
title = {Reasoning about Independence in Probabilistic Models of Relational Data},
author = {Marc Maier and Katerina Marazopoulou and David Jensen},
url = {http://arxiv.org/abs/1302.4381},
year = {2013},
date = {2013-01-01},
journal = {CoRR},
volume = {abs/1302.4381},
abstract = {We extend the theory of d-separation to cases in which data instances are not independent and identically distributed. We show that applying the rules of d-separation directly to the structure of probabilistic models of relational data inaccurately infers conditional independence. We introduce relational d-separation, a theory for deriving conditional independence facts from relational models. We provide a new representation, the abstract ground graph, that enables a sound, complete, and computationally efficient method for answering d-separation queries about relational models, and we present empirical results that demonstrate effectiveness.},
keywords = {},
pubstate = {published},
tppubtype = {misc}
}
2012
Matthew Rattigan
Leveraging Relational Representations for Causal Discovery PhD Thesis
2012, ISBN: 9781267786821, (AAI3545976).
@phdthesis{10.5555/2520420,
title = {Leveraging Relational Representations for Causal Discovery},
author = {Matthew Rattigan},
isbn = {9781267786821},
year = {2012},
date = {2012-01-01},
publisher = {University of Massachusetts Amherst},
abstract = {This thesis represents a synthesis of relational learning and causal discovery, two subjects at the frontier of machine learning research. Relational learning investigates algorithms for constructing statistical models of data drawn from multiple types of interrelated entities, and causal discovery investigates algorithms for constructing causal models from observational data. My work demonstrates that there exists a natural, methodological synergy between these two areas of study, and that despite the sometimes onerous nature of each, their combination (perhaps counterintuitively) can provide advances in the state of the art for both. Traditionally, propositional (or "flat") data representations have dominated the statistical sciences. These representations assume that data consist of independent and identically distributed (iid) entities which can be represented by a single data table. More recently, data scientists have increasingly focused on "relational" data sets that consist of interrelated, heterogeneous entities. However, relational learning and causal discovery are rarely combined. Relational representations are wholly absent from the literature where causality is discussed explicitly. Instead, the literature on causality that uses the framework of graphical models assumes that data are independent and identically distributed. This unexplored topical intersection represents an opportunity for advancement — by combining relational learning with causal reasoning, we can provide insight into the challenges found in each subject area. By adopting a causal viewpoint, we can clarify the mechanisms that produce previously identified pathologies in relational learning. Analogously, we can utilize relational data to establish and strengthen causal claims in ways that are impossible using only propositional representations.},
note = {AAI3545976},
keywords = {},
pubstate = {published},
tppubtype = {phdthesis}
}
2011
Marc Maier, Matthew Rattigan, David Jensen
Indexing Network Structure with Shortest-Path Trees Journal Article
In: ACM Trans. Knowl. Discov. Data, vol. 5, no. 3, 2011, ISSN: 1556-4681.
@article{10.1145/1993077.1993079,
title = {Indexing Network Structure with Shortest-Path Trees},
author = {Marc Maier and Matthew Rattigan and David Jensen},
url = {https://doi.org/10.1145/1993077.1993079},
doi = {10.1145/1993077.1993079},
issn = {1556-4681},
year = {2011},
date = {2011-08-01},
journal = {ACM Trans. Knowl. Discov. Data},
volume = {5},
number = {3},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
abstract = {The ability to discover low-cost paths in networks has practical consequences for knowledge discovery and social network analysis tasks. Many analytic techniques for networks require finding low-cost paths, but exact methods for search become prohibitive for large networks, and data sets are steadily increasing in size. Short paths can be found efficiently by utilizing an index of network structure, which estimates network distances and enables rapid discovery of short paths. Through experiments on synthetic networks, we demonstrate that one such novel network structure index based on the shortest-path tree outperforms other previously proposed indices. We also show that it generalizes across arbitrarily weighted networks of various structures and densities, provides accurate estimates of distance, and has efficient time and space complexity. We present results on real data sets for several applications, including navigation, diameter estimation, centrality computation, and clustering---all made efficient by virtue of the network structure index.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
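The general idea of trading one-off preprocessing for fast approximate distance queries can be sketched with a landmark-style index; the shortest-path-tree index studied in the article is more refined than this, so treat the snippet only as an illustration of the query pattern, with the landmark count and networkx usage as assumptions.
import random
import networkx as nx

def build_index(G, n_landmarks=8, seed=0):
    """Precompute single-source shortest-path lengths from a few random landmarks."""
    landmarks = random.Random(seed).sample(list(G.nodes), n_landmarks)
    return [nx.single_source_dijkstra_path_length(G, l) for l in landmarks]

def estimate_distance(index, u, v):
    """Triangle-inequality upper bound on d(u, v) through the best landmark."""
    return min(d[u] + d[v] for d in index if u in d and v in d)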
Matthew Rattigan, Marc Maier, David Jensen
Relational blocking for causal discovery Proceedings Article
In: Proceedings of the Twenty-Fifth AAAI Conference on Artificial Intelligence, 2011.
@inproceedings{rattigan2011relational,
title = {Relational blocking for causal discovery},
author = {Matthew Rattigan and Marc Maier and David Jensen},
url = {http://www.aaai.org/ocs/index.php/AAAI/AAAI11/paper/view/3760},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the Twenty-Fifth AAAI Conference on Artificial Intelligence},
volume = {25},
number = {1},
abstract = {Blocking is a technique commonly used in manual statistical analysis to account for confounding variables. However, blocking is not currently used in automated learning algorithms. These algorithms rely solely on statistical conditioning as an operator to identify conditional independence. In this work, we present relational blocking as a new operator that can be used for learning the structure of causal models. We describe how blocking is enabled by relational data sets, where blocks are determined by the links in the network. By blocking on entities rather than conditioning on variables, relational blocking can account for both measured and unobserved variables. We explain the mechanism of these methods using graphical models and the semantics of d-separation. Finally, we demonstrate the effectiveness of relational blocking for use in causal discovery by showing how blocking can be used in the causal analysis of two real-world social media systems.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
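Mechanically, blocking amounts to contrasting treated and untreated instances within each block and averaging the contrasts, rather than conditioning on covariates; in the relational setting a block would be, for example, the set of items attached to the same parent entity. A minimal sketch, with the block assignments assumed to be given:
import numpy as np
from collections import defaultdict

def blocked_effect(block_id, treated, outcome):
    """Average within-block difference in outcome between treated and untreated."""
    groups = defaultdict(lambda: {0: [], 1: []})
    for b, t, y in zip(block_id, treated, outcome):
        groups[b][int(t)].append(y)
    diffs = [np.mean(g[1]) - np.mean(g[0])
             for g in groups.values() if g[0] and g[1]]
    return float(np.mean(diffs))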
Phillip B Kirlin, David Jensen
Probabilistic Modeling of Hierarchical Music Analysis. Proceedings Article
In: Proceedings of the 12th International Society for Music Information Retrieval Conference, ISMIR, pp. 393–398, 2011.
@inproceedings{kirlin2011probabilistic,
title = {Probabilistic Modeling of Hierarchical Music Analysis.},
author = {Phillip B Kirlin and David Jensen},
url = {http://ismir2011.ismir.net/papers/PS3-7.pdf},
year = {2011},
date = {2011-01-01},
booktitle = {Proceedings of the 12th International Society for Music Information Retrieval Conference, ISMIR},
pages = {393--398},
abstract = {Hierarchical music analysis, as exemplified by Schenkerian analysis, describes the structure of a musical composition by a hierarchy among its notes. Each analysis defines a set of prolongations, where musical objects persist in time even though others are present. We present a formal model for representing hierarchical music analysis, probabilistic interpretations of that model, and an efficient algorithm for computing the most probable analysis under these interpretations. We represent Schenkerian analyses as maximal outerplanar graphs (MOPs). We use this representation to encode the largest known data set of computer-processable Schenkerian analyses, and we use these data to identify statistical regularities in the human-generated analyses. We show that a dynamic programming algorithm can be applied to these regularities to identify the maximum likelihood analysis for a given piece of music.},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
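Because a maximal outerplanar graph over a note sequence corresponds to a triangulation, the maximum-likelihood analysis can be found with an interval dynamic program, much like chart parsing. In the sketch below, log_prob(i, k, j) is a hypothetical stand-in for a learned log-probability of elaborating the span (i, j) through note k; it is not the paper's trained model.
import functools

def best_analysis(n, log_prob):
    """Return (log-probability, triangles) of the best triangulation of notes 0..n-1."""
    @functools.lru_cache(maxsize=None)
    def best(i, j):
        if j - i < 2:                    # adjacent notes: nothing to elaborate
            return 0.0, ()
        return max(
            (best(i, k)[0] + best(k, j)[0] + log_prob(i, k, j),
             best(i, k)[1] + best(k, j)[1] + ((i, k, j),))
            for k in range(i + 1, j)
        )
    return best(0, n - 1)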
Huseyin Oktay, A Soner Balkir, Ian Foster, David Jensen
Distance estimation for very large networks using mapreduce and network structure indices Proceedings Article
In: Workshop on Information Networks, 2011.
@inproceedings{oktay2011distance,
title = {Distance estimation for very large networks using mapreduce and network structure indices},
author = {Huseyin Oktay and A Soner Balkir and Ian Foster and David Jensen},
year = {2011},
date = {2011-01-01},
booktitle = {Workshop on Information Networks},
keywords = {},
pubstate = {published},
tppubtype = {inproceedings}
}
2010
Michael Hay, Gerome Miklau, David Jensen
Analyzing private network data Journal Article
In: Privacy-aware knowledge discovery: Novel applications and new techniques, pp. 459–498, 2010.
@article{hay2010analyzing,
title = {Analyzing private network data},
author = {Michael Hay and Gerome Miklau and David Jensen},
year = {2010},
date = {2010-01-01},
journal = {Privacy-aware knowledge discovery: Novel applications and new techniques},
pages = {459--498},
publisher = {Chapman & Hall/CRC},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
Michael Hay, Gerome Miklau, David Jensen, Don Towsley, Chao Li
Resisting structural re-identification in anonymized social networks Journal Article
In: The VLDB Journal, vol. 19, no. 6, pp. 797–823, 2010.
@article{hay2010resisting,
title = {Resisting structural re-identification in anonymized social networks},
author = {Michael Hay and Gerome Miklau and David Jensen and Don Towsley and Chao Li},
url = {https://doi.org/10.1007/s00778-010-0210-x},
year = {2010},
date = {2010-01-01},
journal = {The VLDB Journal},
volume = {19},
number = {6},
pages = {797--823},
publisher = {Springer-Verlag},
abstract = {We identify privacy risks associated with releasing network data sets and provide an algorithm that mitigates those risks. A network consists of entities connected by links representing relations such as friendship, communication, or shared activity. Maintaining privacy when publishing networked data is uniquely challenging because an individual's network context can be used to identify them even if other identifying information is removed. In this paper, we quantify the privacy risks associated with three classes of attacks on the privacy of individuals in networks, based on the knowledge used by the adversary. We show that the risks of these attacks vary greatly based on network structure and size. We propose a novel approach to anonymizing network data that models aggregate network structure and then allows samples to be drawn from that model. The approach guarantees anonymity for network entities while preserving the ability to estimate a wide variety of network measures with relatively little bias.},
keywords = {},
pubstate = {published},
tppubtype = {article}
}
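A toy version of the structural re-identification risk analysed in this line of work: for an adversary who knows only a target's degree, the candidate set is every node with that degree, and the risk of re-identification grows as that set shrinks. This is only an illustrative measure, not the anonymization algorithm proposed in the paper.
from collections import Counter
import networkx as nx

def degree_candidate_set_sizes(G):
    """For each node, the number of nodes sharing its degree (its candidate set size)."""
    counts = Counter(d for _, d in G.degree())
    return {v: counts[d] for v, d in G.degree()}

# Nodes with a candidate set of size 1 are uniquely re-identifiable
# by an adversary who knows their degree.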