@Unpublished{daume01coordination, author = {Hal {Daum\'e III}}, title = {Asymmetry of Coordination}, note = {Available at \url{http://www.isi.edu/~hdaume/docs/daume01coordination.ps}}, month = {December}, year = {2001}, abstract = { The standard syntactic analysis of coordination gives equal value to both conjoined elements, and treats both elements equivalently. Nonetheless, in many languages (even English), coordination is much more than simply taking two constituents of the same type (or possibly not) and putting a conjunction between them, yielding a trinary branching node. In this paper I begin with an analysis of coordination in general, present cross-linguistic arguments in its favor, and finally discuss how this structure can account for otherwise unexplained raising data. }, keywords = {ling}, tagline = {A final paper for a graduate linguistics syntax class, examines the issue of coordination in English and Japanese and proposes a unified structure for this phenomena. Also reviews several proposals to date.}, url = {http://pub.hal3.name/#daume01coordination} } @InProceedings{daume01iim, author = {Eric Nyberg and Hal {Daum\'e III}}, title = {Integrated Information Management: An Interactive, Extensible Architecture for Information Retrieval}, booktitle = {Proceedings of the 2001 Human Language Technology Conference (HLT)}, year = {2001}, address = {San Diego, CA}, month = {March 18 -- 21}, abstract = { Most current IR research is focused on specific technologies, such as filtering, classification, entity extraction, question answering, etc. There is relatively little research on merging multiple technologies into sophisticated applications, due in part to the high cost of integrating independently-developed text processing modules. In this paper, we present the Integrated Information Management (IIM) architecture for component-based development of IR applications. 
The IIM architecture is general enough to model different types of IR tasks, beyond indexing and retrieval. }, keywords = {nlp}, tagline = {We present the Integrated Information Management (IIM) architecture for component-based development of IR applications, which is general enough to model different types of IR tasks, beyond indexing and retrieval.}, url = {http://pub.hal3.name/#daume-iim} } @InProceedings{daume02gleans, author = {Hal {Daum\'e III} and Abdesammad Echihabi and Daniel Marcu and Dragos Stefan Munteanu and Radu Soricut}, title = {{GLEANS}: A Generator of Logical Extracts and Abstracts for Nice Summaries}, booktitle = {Proceedings of the Second Document Understanding Conference (DUC)}, year = {2002}, address = {Philadelphia, PA}, month = {July 11 -- 12}, pages = {9 - 14}, abstract = { We briefly describe GLEANS, a summarization system that uses four novel techniques for summarizing document collections. (i) GLEANS first maps all documents in a collection into a canonical, database-like representation that makes explicit the main entities and relations in a document collection. (ii) GLEANS also classifies each document collection into one of four categories: collections about a single person, single events, multiple events, and natural disasters. (iii) For each type of document collection, GLEANS also generates from scratch, using predefined templates, the first two sentences in the abstract. (iv) The rest of the summary is then generated by extracting from the database sentences that conform to a set of predefined schemas and by presenting them in an order that reflects coherence constraints specific to each collection category. 
}, keywords = {nlp}, tagline = {We describe a summarization system that functions by mapping documents into a canonical database-like representation, categorizes the documents, and generates abstracts and extracts based on generic templates.}, url = {http://pub.hal3.name/#daume-gleans} } @InProceedings{daume02lexicalized, author = {Hal {Daum\'e III} and Kevin Knight and Irene {Langkilde-Geary} and Daniel Marcu and Kenji Yamada}, title = {The Importance of Lexicalized Syntax Models for Natural Language Generation Tasks}, booktitle = {Proceedings of the 2002 International Conference on Natural Language Generation (INLG)}, year = {2002}, address = {Harriman, NY}, month = {July 1 -- 3}, pages = {9 - 16}, abstract = { The parsing community has long recognized the importance of lexicalized models of syntax. By contrast, these models do not appear to have had an impact on the statistical NLG community. To prove their importance in NLG, we show that a lexicalized model of syntax improves the performance of a statistical text compression system, and show results that suggest it would also improve the performances of an MT application and a pure natural language generation system. }, keywords = {nlp}, tagline = {We compare the performance difference of n-gram language models and syntax-based language models on pure natural language generation, summarization and machine translation. Our results show that lexicalized syntax-based models greatly improve readability.}, url = {http://pub.hal3.name/#daume-lexicalized} } @InProceedings{daume02noisy, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {A Noisy-Channel Model for Document Compression}, booktitle = {Proceedings of the 40th Annual Meeting of the Association for Computational Linguistics (ACL)}, year = {2002}, month = {July 6 -- 12}, address = {Philadelphia, PA}, pages = {449 - 456}, abstract = { We present a document compression system that uses a hierarchical noisy-channel model of text production. 
Our compression system first automatically derives the syntactic structure of each sentence and the overall discourse structure of the text given as input. The system then uses a statistical hierarchical model of text production in order to drop non-important syntactic and discourse constituents so as to generate coherent, grammatical document compressions of arbitrary length. The system outperforms both a baseline and a sentence-based compression system that operates by simplifying sequentially all sentences in a text. Our results support the claim that discourse knowledge plays an important role in document summarization. }, keywords = {nlp}, tagline = {We present a document compression system that uses a hierarchical noisy-channel model of text production based on combining discourse trees and syntax trees. We apply this system to summarization.}, url = {http://pub.hal3.name/#daume02noisy} } @Unpublished{daume02pbhmm, author = {Hal {Daum\'e III}}, title = {A Phrase-Based {HMM}}, note = {Available at \url{http://www.isi.edu/~hdaume/docs/daume02pbhmm.ps}}, month = {December}, keywords = {nlp}, tagline = {This describes the derivation of the inference algorithms used in the EMNLP 2004 document/summary alignment paper above. 
Unfortunately, there was no room in that document to explain these in detail, so they are relegated to this unpublished version.}, year = {2002} } @Unpublished{daume02yaht, author = {Hal {Daum\'e III}}, title = {Yet Another Haskell Tutorial}, note = {Available at \url{http://pub.hal3.name#daume02yaht/}}, keywords = {haskell}, tagline = {A tutorial on the Haskell programming language, designed for people who have experience programming (just not in a functional language).}, year = {2002} } @Unpublished{daume04abffs, author = {Hal {Daum\'e III}}, title = {Carefully Approximated {Bayes} Factors for Feature Selection in MaxEnt Models}, note = {Available at \url{http://www.isi.edu/~hdaume/docs/daume04abffs.ps}}, month = {November}, keywords = {ml}, tagline = {I describe a method of feature selection in maximum entropy (logistic regression) models based on the Bayes factor. I approximate data evidence using the Laplace approximation, and using two reasonable assumptions manage to get a very efficient procedure that has many benefits, including automatic stopping.}, year = {2004} } @InProceedings{daume04bracketing, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {NP Bracketing by Maximum Entropy Tagging and {SVM} Reranking}, booktitle = {Empirical Methods in Natural Language Processing}, year = {2004}, address = {Barcelona, Spain}, abstract = { We perform Noun Phrase Bracketing by using a local, maximum entropy-based tagging model, which produces bracketing hypotheses. These hypotheses are subsequently fed into a reranking framework based on support vector machines. We solve the problem of hierarchical structure in our tagging model by modeling underspecified tags, which are fully determined only at decoding time. The tagging model performs comparably to competing approaches and the subsequent reranking increases our system's performance from an f-score of $81.7$ to $86.1$, surpassing the best reported results to date of $83.8$. 
}, keywords = {nlp sp}, tagline = {NP Bracketing is the task of identifying all base- and embedded-NPs. We do this task first by ME tagging using an underspecified tagset, and then SVM-based reranking of hypotheses. We get best performance to date, and offer advantages over full parsers.}, url = {http://pub.hal3.name/#daume04bracketing} } @Unpublished{daume04cg-bfgs, author = {Hal {Daum\'e III}}, title = {Notes on {CG} and {LM-BFGS} Optimization of Logistic Regression}, note = {Paper available at \url{http://pub.hal3.name#daume04cg-bfgs}, implementation available at \url{http://hal3.name/megam/}}, month = {August}, keywords = {ml}, tagline = {These are notes on the implementation I have written of conjugate gradient and limited memory BFGS optimization for logistic regression (aka maximum entropy) classifiers. The notes were created because it is actually quite difficult to find good references on efficient implementation of these algorithms, though discussion of them exists everywhere.}, year = {2004} } @InProceedings{daume04fusion, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Generic Sentence Fusion is an Ill-Defined Summarization Task}, booktitle = {Proceedings of the Text Summarization Branches Out Workshop at ACL (TextSum)}, year = {2004}, address = {Barcelona, Spain}, month = {July 25 -- 26}, abstract = { We report on a series of human evaluations of the task of sentence fusion. In this task, a human is given two sentences and asked to produce a single coherent sentence that contains only the \emph{important} information from the original two. Thus, this is a highly constrained summarization task. Our investigations show that even at this restricted level, there is no measurable agreement between humans regarding what information should be considered important. We further investigate the ability of separate evaluators to assess summaries, and find similarly disturbing lack of agreement. 
}, keywords = {nlp}, tagline = {We perform a series of human experiments to assess the legitimacy of the task of fusing two sentences without context or task. We find that there is no agreement between humans on this task, even in the limited case of 2 to 1 sentence fusion.}, url = {http://pub.hal3.name/#daume04fusion} } @InProceedings{daume04intents, author = {Hal {Daum\'e III} and Eric Brill}, title = {Web Search Intent Induction via Automatic Query Reformulation}, booktitle = {North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2004}, address = {Boston, MA}, abstract = { We present a computationally efficient method for automatic grouping of web search results based on reformulating the original query to alternative queries the user may have intended. The method requires no data other than query logs and the standard inverted indices used by most search engines. Our method outperforms standard web search in the task of enabling users to quickly find relevant documents for informational queries. }, keywords = {nlp}, tagline = {We describe an algorithm that uses queries made by other people to help nail down intents for underspecified queries, and show its effectiveness in a real system. This is a short version of an unpublished paper available in Postscript or PDF format.}, url = {http://pub.hal3.name/#daume04intents} } @Misc{daume04mani, author = {Hal {Daum\'e III}}, title = {Book Review: Automatic Summarization (I. 
Mani)}, year = {2004}, howpublished = {Machine Translation Journal}, keywords = {nlp}, url = {http://pub.hal3.name/#daume04mani} } @InProceedings{daume04pbhmm, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {A Phrase-Based {HMM} Approach to Document/Abstract Alignment}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2004}, address = {Barcelona, Spain}, abstract = { We describe a model for creating word-to-word and phrase-to-phrase alignments between documents and their human written abstracts. Such alignments are critical for the development of statistical summarization systems that can be trained on large corpora of document/abstract pairs. Our model, which is based on a novel Phrase-Based HMM, outperforms both the Cut \& Paste alignment model \cite{jing:cl} and models developed in the context of machine translation \cite{brownetal93}. }, keywords = {nlp}, tagline = {We present a model called the phrase-based HMM (see under unpublished work below) for producing word-to-word and phrase-to-phrase alignments between documents and abstracts. This is in some sense my seminal accomplishment in summarization, and one of the pieces of work I am currently most proud of. I have also made available the annotation software and guide. 
The slides from an extended version of the talk are available in OpenOffice format, or for view in your browser (warning: animations don't work in the HTML version, which makes some slides impossible to read).}, url = {http://pub.hal3.name/#daume04pbhmm} } @Unpublished{daume04rkhs, author = {Hal {Daum\'e III}}, title = {From Zero to Reproducing Kernel Hilbert Spaces in Twelve Pages or Less}, note = {Available at \url{http://www.isi.edu/~hdaume/docs/daume04rkhs.ps}}, month = {February}, keywords = {ml}, tagline = {This is a tutorial on the mathematics behind RKHSs and their use in machine learning (eg., support vector machines and gaussian processes).}, year = {2004} } @InProceedings{daume04scm, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Supervised clustering with the Dirichlet process}, booktitle = {NeurIPS Workshop on Learning With Structured Outputs (LwSO)}, year = {2004}, address = {Whistler, Canada}, abstract = { The task of learning to partition data into similar sets occurs frequently in many disciplines. We construct a Bayesian model for learning to partition from labeled data. Our model is based on the nonparametric Dirichlet process prior. Experimental results show that our model is able to outperform existing solutions on real world datasets. }, keywords = {ml bayes}, tagline = {In problems such as reference matching, identity uncertainty and coreference resolution, you need to be able to learn to partition sets based on supervised examples. We present a Bayesian model for learning how to do this based on the semi-parametric Dirichlet process prior. 
A more complete version of this paper is in preparation.}, url = {http://pub.hal3.name/#daume04scm} } @InProceedings{daume04treeposition, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {A Tree-Position Kernel for Document Compression}, booktitle = {Proceedings of the Fourth Document Understanding Conference (DUC)}, year = {2004}, address = {Boston, MA}, month = {May 6 -- 7}, abstract = { We describe our entry into the DUC 2004 automatic document summarization competition. We competed only in the single document, headline generation task. Our system is based on a novel kernel dubbed the tree position kernel, combined with two other well-known kernels. Our system performs well on white-box evaluations, but does very poorly in the overall DUC evaluation. However, the latter results are offset by the fact that baseline systems consistently outperform well engineered systems. }, keywords = {nlp ml}, tagline = {We introduce the Tree-Position kernel for performing document compression, in a model which is a generalization of the document compression model from ACL 2002 (below). Works well by itself, but not in a full system. The slides from the talk are available as Postscript or PDF.}, url = {http://pub.hal3.name/#daume04treeposition} } @Article{daume05alignments, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Induction of Word and Phrase Alignments for Automatic Document Summarization}, journal = {Computational Linguistics (CL)}, year = {2005}, month = {December}, volume = {31}, number = {4}, pages = {505--530}, abstract = { Current research in automatic single document summarization is dominated by two effective, yet na\"ive approaches: summarization by sentence extraction, and headline generation via bag-of-words models. While successful in some tasks, neither of these models is able to adequately capture the large set of linguistic devices utilized by humans when they produce summaries. 
One possible explanation for the widespread use of these models is that good techniques have been developed to extract appropriate training data for them from existing document/abstract and document/headline corpora. We believe that future progress in automatic summarization will be driven both by the development of more sophisticated, linguistically informed models, as well as a more effective leveraging of document/abstract corpora. In order to open the doors to simultaneously achieving both of these goals, we have developed techniques for automatically producing word-to-word and phrase-to-phrase \emph{alignments} between documents and their human-written abstracts. These alignments make explicit the correspondences that exist in such document/abstract pairs, and create a potentially rich data source from which complex summarization algorithms may learn. This paper describes experiments we have carried out to analyze the ability of \emph{humans} to perform such alignments, and based on these analyses, we describe experiments for creating them automatically. Our model for the alignment task is based on an extension of the standard hidden Markov model, and learns to create alignments in a completely unsupervised fashion. We describe our model in detail and present experimental results that show that our model is able to learn to reliably identify word- and phrase-level alignments in a corpus of \docabs\ pairs. }, keywords = {nlp sum}, tagline = {We develop a model for automatically building word and phrase alignments for the task of automatic document summarization, based on the segment HMM (or semi-HMM). 
This explicates and extends the EMNLP paper below, with more experiments and some syntactically motivated transition models.}, url = {http://pub.hal3.name/#daume05alignments} } @InProceedings{daume05coref, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {A Large-Scale Exploration of Effective Global Features for a Joint Entity Detection and Tracking Model}, booktitle = {Joint Conference on Human Language Technology and Empirical Methods in Natural Language Processing (HLT/EMNLP)}, year = {2005}, address = {Vancouver, Canada}, abstract = { Entity detection and tracking (EDT) is the task of identifying textual mentions of real-world entities in documents, extending the named entity detection and coreference resolution task by considering mentions other than names (pronouns, definite descriptions, etc.). Like NE tagging and coreference resolution, most solutions to the EDT task separate out the mention detection aspect from the coreference aspect. By doing so, these solutions are limited to using only local features for learning. In contrast, by modeling both aspects of the EDT task simultaneously, we are able to learn using highly complex, non-local features. We develop a new joint EDT model and explore the utility of many features, demonstrating their effectiveness on this task. 
}, keywords = {nlp}, tagline = {We apply the LaSO technique to the EDT problem (entity identification and coreference); doing so allows us to use many more interesting features than have been previously available in models for this task, and we get very good results on the ACE benchmark data (with very efficient algorithms).}, url = {http://pub.hal3.name/#daume05coref} } @Article{daume05dpscm, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {A {B}ayesian Model for Supervised Clustering with the {D}irichlet Process Prior}, journal = {Journal of Machine Learning Research (JMLR)}, year = {2005}, month = {September}, volume = {6}, pages = {1551--1577}, abstract = { We develop a Bayesian framework for tackling the supervised clustering problem, the generic problem encountered in problems such as reference matching, coreference resolution, identity uncertainty and record linkage. Our clustering model is based on the non-parametric Dirichlet process prior, which enables us to define distributions over the countably infinite sets that naturally arise in this problem. We add \emph{supervision} to our model by positing the existence of a set of unobserved random variables (we call these ``reference types'') that are generic across all clusters. Inference in our framework, which require integrating over infinitely many parameters, is solved using Markov chain Monte Carlo techniques. We present algorithms for both conjugate and non-conjugate priors. We present a simple -- but general -- parameterization of our model based on a Gaussian assumption. We evaluate this model on one artificial task and three real-world tasks, comparing it against both unsupervised and state-of-the-art supervised algorithms. Our results show that our model is able to outperform other models for this task across a variety of performance metrics. 
}, keywords = {ml bayes}, tagline = {We develop three models for the supervised clustering problem based on the Dirichlet process as a prior distribution over the potentially infinite sets encountered in this problem. On three of four data sets, we achieve quite impressive results. This extends and completes the NeurIPS workshop paper below.}, url = {http://pub.hal3.name/#daume05dpscm} } @InProceedings{daume05duc, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Bayesian Summarization at DUC and a Suggestion for Extrinsic Evaluation}, booktitle = {Proceedings of the Document Understanding Conference (DUC)}, year = {2005}, address = {Vancouver, B.C., Canada}, month = {October 9--10}, abstract = { We describe our entry into the Document Understanding Conference competition for evaluating query-focused multi-document summarization systems. Our system is based on a Bayesian Query-Focused Summarization model, similar to the system we entered into the MSE competition. This paper begins by describing the (few) differences between our DUC system and our MSE system and describes our placement in the competition. The remainder of this paper argues in favor of performing \emph{extrinsic} evaluation of summarization systems, and suggests a method for doing so. }, keywords = {nlp sum}, tagline = {Since we submitted essentially the same system to DUC as to MSE, we briefly describe the minor differences, then spend the rest of the paper advocating a new evaluation technique, called Filtering by Summary. 
This is an extrinsic evaluation that is easy to deploy, requires little training, and isn't costly.}, url = {http://pub.hal3.name/#daume05duc} } @InProceedings{daume05laso, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Learning as Search Optimization: Approximate Large Margin Methods for Structured Prediction}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2005}, address = {Bonn, Germany}, errata = {There are some technical errors in this paper; see On Learning Linear Ranking Functions for Beam Search by Xu and Fern, ICML 2007, at http://web.engr.oregonstate.edu/~afern/papers/beam-icml07.pdf for more details.}, abstract = { Mappings to structured output spaces (strings, trees, partitions, etc.) are typically learned using extensions of classification algorithms to simple graphical structures (eg., linear chains) in which search and parameter estimation can be performed exactly. Unfortunately, in many complex problems, it is rare that exact search or parameter estimation is tractable. Instead of learning exact models and searching via heuristic means, we embrace this difficulty and treat the structured output problem in terms of approximate search. We present a framework for learning as search optimization, and two parameter updates with convergence theorems and bounds. Empirical evidence shows that our integrated approach to learning and decoding can outperform exact models at smaller computational cost. }, keywords = {ml sp}, tagline = {We describe a machine learning framework for producing structured outputs (such as sequences, parse trees, sentences, etc.) when search is intractable. The framework is based on combining online updates with a generic search algorithm, and leads to very efficient, very effective models. 
Slides in OpenOffice or PDF format.}, url = {http://pub.hal3.name/#daume05laso} } @InProceedings{daume05mse, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Bayesian Multi-Document Summarization at MSE}, booktitle = {Proceedings of the Workshop on Multilingual Summarization Evaluation (MSE)}, year = {2005}, address = {Ann Arbor, MI}, month = {June 29}, abstract = { We describe our entry into the Multilingual Summarization Evaluation (MSE) competition for evaluating generic multi-document summarization systems, where documents are drawn both from English data and English translations of Arabic data. Our system is based on a Bayesian Query-Focused Summarization model, adapted to the generic, multi-document setting and tuned against the \textsc{Rouge} evaluation metric. In the human pyramid-based evaluation, our system scored an average of $0.530$, approximately $8\%$ better than the next best system, which scored $0.489$. In the automatic evaluation, our system scored $0.157$ (behind four other sites) with the skip-bigram evaluation, and $0.131$ (behind two other sites) with the standard bigram evaluation. }, keywords = {nlp sum}, tagline = {We describe our entry into the MSE competition; it is based on our Bayesian Query-Focused Summarization model (under review), adapted to the multidocument setting. The model performed very well in the evaluation, coming in first according to the human evaluation and fifth or third according to the two automatic evaluations.}, url = {http://pub.hal3.name/#daume05mse} } @InProceedings{daume05search, author = {Hal {Daum\'e III} and John Langford and Daniel Marcu}, title = {Search-Based Structured Prediction as Classification}, booktitle = {NeurIPS Workshop on Advances in Structured Learning for Text and Speech Processing (ASLTSP)}, year = {2005}, address = {Whistler, Canada}, abstract = { Solutions to computationally hard problems often require that search be used. 
Integrating search into the learning phase has been previously proposed in an ad-hoc manner (Daume & Marcu, 2005). In this paper, we show that structured prediction can be mapped into a search setting using language from reinforcement learning, and known techniques for reinforcement learning (Langford et al., 2005) can give formal performance bounds on the structured prediction task. }, keywords = {ml sp}, tagline = {We extend and formalize the LaSO framework developed in the ICML 2005 paper and obtain significantly stronger theoretical guarantees, even for non-0/1 loss. Relationships to reinforcement learning are also discussed.}, url = {http://pub.hal3.name/#daume05search} } @InProceedings{daume06bqfs, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Bayesian Query-Focused Summarization}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2006}, address = {Sydney, Australia}, abstract = { We present BayeSum (for ``Bayesian summarization''), a model for sentence extraction in query-focused summarization. BayeSum leverages the common case in which multiple documents are relevant to a single query. Using these documents as reinforcement for query terms, BayeSum is not afflicted by the paucity of information in short queries. We show that approximate inference in BayeSum is possible on large data sets and results in a state-of-the-art summarization system. Furthermore, we show how BayeSum can be understood as a justified query expansion technique in the language modeling for IR framework. }, keywords = {nlp bayes sum}, tagline = {We describe the BayeSum system (previously called BQFS in our DUC and MSE papers) for query-focused sentence extraction in a Bayesian framework (the model looks like a topic model if you squint a little). Achieves competitive performance on white-box experiments and also leads to good systems in DUC. 
Also, from an IR perspective, provides a formalism for statistically grounded query expansion. Slides available as OpenOffice and PDF.}, url = {http://pub.hal3.name/#daume06bqfs} } @article{daume06megam, author = {Hal {Daum\'e III} and Daniel Marcu}, title = {Domain Adaptation for Statistical Classifiers}, journal = {Journal of Artificial Intelligence Research (JAIR)}, year = {2006}, volume = {26}, pages = {101--126}, abstract = { The most basic assumption used in statistical learning theory is that training data and test data are drawn from the same underlying distribution. Unfortunately, in many applications, the ``in-domain'' test data is drawn from a distribution that is related, but not identical, to the ``out-of-domain'' distribution of the training data. We consider the common case in which labeled out-of-domain data is plentiful, but labeled in-domain data is scarce. We introduce a statistical formulation of this problem in terms of a simple mixture model and present an instantiation of this framework to maximum entropy classifiers and their linear chain counterparts. We present efficient inference algorithms for this special case based on the technique of conditional expectation maximization. Our experimental results show that our approach leads to improved performance on three real world tasks on four different data sets from the natural language processing domain. }, keywords = {nlp ml da}, tagline = {We address the problem of moving classifiers from one domain to another (compromising the second "i" in i.i.d.) by treating the data distribution as a mixture of three sources: in-domain, out-of-domain and general. Inference is based on conditional EM for conditional models (maximum entropy) and we get quite good performance on three natural language tasks. 
(NOTE: There are errors in some of the equation derivations; I will put out an official errata soon!)}, url = {http://pub.hal3.name/#daume06megam} } @unpublished{daume06searn-practice, author = {Hal {Daum\'e III} and John Langford and Daniel Marcu}, title = {Searn in Practice}, year = {2006}, abstract = { We recently introduced an algorithm, Searn, for solving hard structured prediction problems. This algorithm enjoys many nice properties: efficiency, wide applicability, theoretical justification and simplicity. However, under a desire to fit a lot of information into the original paper, it may not be so clear how simple the technique is. This report is designed to showcase how Searn can be applied to a wide variety of techniques and what really goes on behind the scenes. We will make use of three example problems, ranging from simple to complex. These are: (1) sequence labeling, (2) parsing and (3) machine translation. (These were chosen to be as widely understandable, especially in the NLP community, as possible.) In the end, we will come back to discuss Searn for general problems. }, keywords = {nlp ml sp}, tagline = {We recently introduced an algorithm, Searn, for solving hard structured prediction problems. This report is designed to showcase how Searn can be applied to a wide variety of techniques and what really goes on behind the scenes. We show how to apply Searn to three common NLP problems: (1) sequence labeling, (2) parsing and (3) machine translation.}, url = {http://pub.hal3.name/#daume06searn-practice} } @article{daume09searn, author = {Hal {Daum\'e III} and John Langford and Daniel Marcu}, title = {Search-based Structured Prediction}, year = {2009}, journal = {Machine Learning Journal (MLJ)}, abstract = { We present Searn, an algorithm for integrating search and learning to solve complex structured prediction problems such as those that occur in natural language, speech, computational biology, and vision. 
Searn is a meta-algorithm that transforms these complex problems into simple classification problems to which any binary classifier may be applied. Unlike current algorithms for structured learning that require decomposition of both the loss function and the feature functions over the predicted structure, Searn is able to learn prediction functions for any loss function and any class of features. Moreover, Searn comes with a strong, natural theoretical guarantee: good performance on the derived classification problems implies good performance on the structured prediction problem. }, url = {http://pub.hal3.name/#daume06searn} } @PhdThesis{daume06thesis, author = {Hal {Daum\'e III}}, title = {Practical Structured Learning Techniques for Natural Language Processing}, school = {University of Southern California}, year = {2006}, address = {Los Angeles, CA}, month = {August}, keywords = {sp nlp ml}, tagline = {Committee: D. Marcu, K. Knight, E. Hovy, S. Schaal, G. James, A. McCallum.
This thesis describes an algorithm for solving many of the complex prediction problems encountered in NLP applications. The algorithm comes with strong theoretical guarantees, is empirically effective in applications such as IE and summarization, is efficient and is easy to implement.}, url = {http://pub.hal3.name/#daume06thesis} } @InProceedings{daume07astar-dp, author = {Hal {Daum\'e III}}, title = {Fast search for Dirichlet process mixture models}, booktitle = {Proceedings of the Eleventh International Conference on Artificial Intelligence and Statistics (AIStats)}, year = {2007}, address = {San Juan, Puerto Rico}, abstract = { Dirichlet process (DP) mixture models provide a flexible Bayesian framework for density estimation. Unfortunately, their flexibility comes at a cost: inference in DP mixture models is computationally expensive, even when conjugate distributions are used. In the common case when one seeks only a maximum a posteriori assignment of data points to clusters, we show that search algorithms provide a practical alternative to expensive MCMC and variational techniques. When a true posterior sample is desired, the solution found by search can serve as a good initializer for MCMC. Experimental results show that using these techniques it is possible to apply DP mixture models to very large data sets. }, keywords = {ml bayes}, tagline = {When using DPs for mixture modeling, one often only cares about getting the MAP cluster assignment. In such cases, we show that it is a good idea just to do an efficient search, rather than performing (expensive) sampling. A surprising result emerges: nearly greedy search actually works incredibly well! 
Slides available as OpenOffice and PDF.}, url = {http://pub.hal3.name/#daume07astar-dp} } @InProceedings{daume07coalescent, author = {Yee Whye Teh and Hal {Daum\'e III} and Daniel Roy}, title = {Bayesian Agglomerative Clustering with Coalescents}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2007}, address = {Vancouver, Canada}, abstract = { We introduce a new Bayesian model for hierarchical clustering based on a prior over trees called Kingman's coalescent. We develop novel greedy and sequential Monte Carlo inferences which operate in a bottom-up agglomerative fashion. We show experimentally the superiority of our algorithms over others, and demonstrate our approach in document clustering and phylolinguistics. }, keywords = {ml bayes}, tagline = {We introduce a new Bayesian model for hierarchical clustering based on a prior over trees called Kingman's coalescent. We develop novel greedy and sequential Monte Carlo inferences which operate in a bottom-up agglomerative fashion. We show experimentally the superiority of our algorithms over others, and demonstrate our approach in document clustering and phylolinguistics.}, url = {http://pub.hal3.name/#daume07coalescent} } @InProceedings{daume07easyadapt, author = {Hal {Daum\'e III}}, title = {Frustratingly Easy Domain Adaptation}, booktitle = {Conference of the Association for Computational Linguistics (ACL)}, year = {2007}, address = {Prague, Czech Republic}, abstract = { We describe an approach to domain adaptation that is appropriate exactly in the case when one has enough ``target'' data to do slightly better than just using only ``source'' data. Our approach is incredibly simple, easy to implement as a preprocessing step (10 lines of Perl!) and outperforms state-of-the-art approaches on a range of datasets. The technique comes with several simple theoretical guarantees. 
Moreover, it is trivially extended to a multi-domain adaptation problem, where one has data from a variety of different domains. }, keywords = {nlp ml da}, award = {Test of Time Award Nomination (2017)}, tagline = {When both source and target labeled data are available, a very simple "merge" can lead to state-of-the-art results on the target task. This is essentially a reduction for domain adaptation and can be implemented in 10 lines of Perl. Slides available as OpenOffice and PDF.}, url = {http://pub.hal3.name/#daume07easyadapt}, } @InProceedings{daume07implication, author = {Hal {Daum\'e III} and Lyle Campbell}, title = {A {B}ayesian Model for Discovering Typological Implications}, booktitle = {Conference of the Association for Computational Linguistics (ACL)}, year = {2007}, address = {Prague, Czech Republic}, abstract = { A standard form of analysis for linguistic typology is the universal implication. These implications state facts about the range of extant languages, such as ``if objects come after verbs, then adjectives come after nouns.'' Such implications are typically discovered by painstaking hand analysis over a small sample of languages. We propose a computational model for assisting at this process. Our model is able to discover both well-known implications as well as some novel implications that deserve further study. Moreover, through a careful application of hierarchical analysis, we are able to cope with the well-known sampling problem: languages are not independent. }, keywords = {nlp bayes ling}, tagline = {Based on the WALS data, we automatically discover typological implications of the form "verb-object implies noun-adjective." Many of our discovered implications are well known; some are not. We introduce a hierarchical prior to cope with the sampling problem. 
Slides available as OpenOffice and PDF.}, url = {http://pub.hal3.name/#daume07implication} } @InProceedings{daume08coherence, author = {Devyani Ghosh and John Carter and Hal {Daum\'e III}}, title = {Perceptron-based Coherence Predictors}, booktitle = {Proceedings of the 2nd Workshop on Chip Multiprocessor Memory Systems and Interconnects (ICSA)}, year = {2008}, address = {Beijing, China}, abstract = { Coherence misses in shared-memory multiprocessors account for a substantial fraction of execution time in many important workloads. Just as branch predictors reduce the performance impact of branches, coherence predictors can reduce the performance impact of coherence misses. Two-level pattern-based coherence predictors have offered a general prediction method to trigger appropriate coherence actions. This paper presents the design and evaluation of a perceptron-based coherence predictor that extends a conventional directory-based write-invalidate protocol to predict when to push updates to remote nodes. When predicted correctly, the update eliminates a coherence miss on the remote node. We also present a simple mechanism for predicting to which nodes we should push updates. We evaluate our perceptron-based update predictor on a variety of SPLASH-2 and PARSEC benchmarks. Simulation indicates that the update predictor eliminates an average of 30\% of coherence misses. Our simple consumer prediction mechanism sent very few useless updates of updates were consumed (eliminated misses). 
}, keywords = {ml}, tagline = {We present a learning solution to the coherence problem in multicore systems, achieving very high precision and recall on the "will you need this data" prediction task.}, url = {http://pub.hal3.name/#daume08coherence} } @InProceedings{daume08flat, author = {Percy Liang and Hal {Daum\'e III} and Dan Klein}, title = {Structure Compilation: Trading Structure for Features}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2008}, address = {Helsinki, Finland}, abstract = { Structured models often achieve excellent performance but can be slow at test time. We investigate structure compilation, where we replace structure with features, which are often computationally simpler but unfortunately statistically more complex. We analyze this tradeoff theoretically and empirically on three natural language processing tasks. We also introduce a simple method to transfer predictive power from structure to features via unlabeled data, while incurring a minimal statistical penalty. }, keywords = {nlp ml sp}, tagline = {We investigate the trade-off between structured models and feature-rich models, empirically and theoretically. We also present a way to trade structure for features using unlabeled data.}, url = {http://pub.hal3.name/#daume08flat} } @inproceedings{daume08hbc, title = {{HBC}: Hierarchical Bayes Compiler}, author = {Hal {Daum\'e III}}, booktitle = {Workshop on Bayesian Inference}, year = {2008}, abstract = { These goals distinguish HBC from other Bayesian modeling software, such as Bugs (or WinBugs [3]). In particular, our primary goal is that models created in HBC can be used directly, rather than only as a first-pass test. Moreover, we aim for scalability with respect to data size. Finally, since the goal of HBC is to compile hierarchical models into standard programming languages (like C), these models can easily be used as part of a larger system. 
This last point is in the spirit of the dynamic programming language Dyna [2]. }, keywords = {ml}, url = {http://pub.hal3.name/#daume08hbc}, } @InProceedings{daume08hints, author = {Hal {Daum\'e III}}, title = {Cross-Task Knowledge-Constrained Self Training}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2008}, address = {Honolulu, Hawaii}, abstract = { We present an algorithmic framework for learning multiple related tasks. Our framework exploits a form of prior knowledge that relates the output spaces of these tasks. We present PAC learning results that analyze the conditions under which such learning is possible. We present results on learning a shallow parser and named-entity recognition system that exploits our framework, showing consistent improvements over baseline methods. }, keywords = {nlp ml sp da}, tagline = {We present a simple algorithm for self-training with knowledge that tells us when outputs for multiple tasks are "compatible." We can learn to do NER on a tiny amount of labeled data by this method.}, url = {http://pub.hal3.name/#daume08hints} } @InProceedings{daume08ihfrm, author = {Piyush Rai and Hal {Daum\'e III}}, title = {The Infinite Hierarchical Factor Regression Model}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2008}, address = {Vancouver, Canada}, abstract = { }, keywords = {ml bayes}, tagline = {We address shortcomings in factor analysis by: (1) we do not assume a known number of factors; (2) we do not assume factors are independent; (3) we do not assume all features are relevant to the factor analysis. 
We apply this to microarray analysis data.}, url = {http://pub.hal3.name/#daume08ihfrm} } @InProceedings{daume08transliterate, author = {Ulf Hermjakob and Kevin Knight and Hal {Daum\'e III}}, title = {Name Translation in Statistical Machine Translation: Learning When to Transliterate}, booktitle = {Conference of the Association for Computational Linguistics (ACL)}, year = {2008}, address = {Columbus, OH}, abstract = { We present a method to transliterate names in the framework of end-to-end statistical machine translation. The system is trained to learn when to transliterate. For Arabic to English MT, we developed and trained a transliterator on a bitext of 7 million sentences and Google's English terabyte ngrams and achieved better name translation accuracy than 3 out of 4 professional translators. The paper also includes a discussion of challenges in name translation evaluation. }, keywords = {nlp}, tagline = {We transliterate names in an end-to-end MT system, where names are explicitly tagged with whether to transliterate or not. Our transliterator outperforms 3 of 4 human translators.}, url = {http://pub.hal3.name/#daume08transliterate} } @InProceedings{daume09areal, author = {Hal {Daum\'e III}}, title = {Non-Parametric {B}ayesian Model of Areal Linguistics}, booktitle = {North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2009}, address = {Boulder, CO}, abstract = { We describe a statistical model over linguistic areas and phylogeny. Our model recovers known areas and identifies a plausible hierarchy of areal features. The use of areas improves genetic reconstruction of languages both qualitatively and quantitatively according to a variety of metrics. We model linguistic areas by a Pitman-Yor process and linguistic phylogeny by Kingman's coalescent. }, keywords = {nlp bayes ling}, tagline = {We present a model of linguistic phylogenetics that takes into account areal effects in a non-parametric way. 
Our model simultaneously discovers phylogenetic trees, areal clusters and areal features.}, url = {http://pub.hal3.name/#daume09areal} } @InProceedings{daume09cca, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Multi-Label Prediction via Sparse Infinite {CCA}}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2009}, address = {Vancouver, Canada}, abstract = { Canonical Correlation Analysis (CCA) is a useful technique for modeling dependencies between two (or more) sets of variables. Building upon the recently suggested probabilistic interpretation of CCA, we propose a nonparametric, fully Bayesian framework that can automatically select the number of correlation components, and effectively capture the sparsity underlying the projections. In addition, given (partially) labeled data, our algorithm can also be used as a (semi)supervised dimensionality reduction technique, and can be applied to learn useful predictive features in the context of learning a set of related tasks. Experimental results demonstrate the efficacy of the proposed approach for both CCA as a stand-alone problem, and when applied to multi-label prediction. }, keywords = {ml bayes}, tagline = {Using an infinte variant of CCA, we propose a non-parametric Bayesian framework for semi-supervised dimensionality reduction for multitask learning and multi-label prediction.}, url = {http://pub.hal3.name/#daume09cca} } @Article{daume09graining, author = {Pu Liu and Qiang Shi and Hal {Daum\'e III} and Gregory Voth}, title = {A Bayesian Statistics Approach to Multiscale Coarse Graining}, journal = {Journal of Chemical Physics (J.ChPhys)}, year = {2009}, volume = {129}, number = {21}, pages = {214114}, month = {December}, abstract = { Coarse-grained (CG) modeling provides a promising way to investigate many important physical and biological phenomena over large spatial and temporal scales. 
The multiscale coarse-graining (MS-CG) method has been proven to be a thermodynamically consistent way to systematically derive a CG model from atomistic force information, as shown in a variety of systems, ranging from simple liquids to proteins embedded in lipid bilayers. In the present work, Bayes' theorem, an advanced statistical tool widely used in signal processing and pattern recognition, is adopted to further improve the MS-CG force field obtained from the CG modeling. This approach can regularize the linear equation resulting from the underlying force-matching methodology, therefore substantially improving the quality of the MS-CG force field, especially for the regions with limited sampling. Moreover, this Bayesian approach can naturally provide an error estimation for each force field parameter, from which one can know the extent the results can be trusted. The robustness and accuracy of the Bayesian MS-CG algorithm is demonstrated for three different systems, including simple liquid methanol, polyalanine peptide solvated in explicit water, and a much more complicated peptide assembly with 32 NNQQNY hexapeptides. }, keywords = {ml bayes}, tagline = {We apply Bayesian analysis to model a multiscale coarse-grained force field in a thermodynamically consistent way. 
The robustness and accuracy of the Bayesian MS-CG algorithm is demonstrated for three different systems, including simple liquid methanol, polyalanine peptide solvated in explicit water, and a much more complicated peptide assembly with 32 NNQQNY hexapeptides.}, } @InProceedings{daume09hetero, author = {Amrish Kapoor and Piyush Rai and Hal {Daum\'e III}}, title = {Factor Regression Combining Heterogeneous Sources of Information}, booktitle = {Proceedings of NeurIPS Workshop on Learning From Multiple Sources with Applications to Robotics (LMS)}, year = {2009}, address = {Vancouver, Canada}, abstract = { We present a non-parametric Bayesian factor regression model that combines two heterogeneous sources of information: gene expression arrays and text from their corresponding PubMed abstracts. Our model approximates a pLSI style model and results in improved regression accuracy. We apply this model to gene-expression data analysis, but it is extendable to other problems exhibiting a similar heterogeneous multiplicity in sources of information, like financial analysis, weather prediction and others. }, keywords = {ml bayes}, tagline = {We combine text and microarray data in a factor regression model to achieve higher prediction accuracy.}, url = {http://pub.hal3.name/#daume09hetero} } @InProceedings{daume09hiermtl, author = {Hal {Daum\'e III}}, title = {Bayesian Multitask Learning with Latent Hierarchies}, booktitle = {Conference on Uncertainty in Artificial Intelligence (UAI)}, year = {2009}, address = {Montreal, Canada}, abstract = { We learn multiple hypotheses for related tasks under a latent hierarchical relationship between tasks. We exploit the intuition that for \emph{domain adaptation}, we wish to share classifier structure, but for \emph{multitask learning}, we wish to share covariance structure. Our hierarchical model is seen to subsume several previously proposed multitask learning models and performs well on three distinct real-world data sets. 
}, keywords = {ml bayes da}, tagline = {We show that hierarchical multitask learning can be accomplished even when the hierarchical structure is not known in advance. We use the coalescent to make this happen.}, url = {http://pub.hal3.name/#daume09hiermtl} } @InProceedings{daume09hybrid, author = {Arvind Agarwal and Hal {Daum\'e III}}, title = {Exponential Family Hybrid Semi-Supervised Learning}, booktitle = {International Joint Conference on Artificial Intelligence (IJCAI)}, year = {2009}, address = {Pasadena, CA}, abstract = { We present an approach to semi-supervised learning based on an exponential family characterization. Our approach generalizes previous work on coupled priors for hybrid generative/discriminative models. Our model is more flexible and natural than previous approaches. Experimental results on several data sets show that our approach also performs better in practice. }, keywords = {ml}, tagline = {We generalize the coupled prior approach to hybrid probabilistic models to arbitrary exponential family distributions. Experiments show that this is a good idea.}, url = {http://pub.hal3.name/#daume09hybrid}, award = {Best Paper Award}, } @InProceedings{daume09ibpsearch, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Fast Search for Infinite Latent Feature Models}, booktitle = {Proceedings of NeurIPS Workshop on Non-parametric Bayes (NP-Bayes)}, year = {2009}, address = {Vancouver, Canada}, abstract = { We propose several search based alternatives for inference in the Indian Buffet Process (IBP) based models. We consider the case when we only want a maximum a posteriori (MAP) estimate of the latent feature assignment matrix. If true posterior samples are required, these MAP estimates can also serve as intelligent initializers for MCMC based algorithms. Another advantage of the proposed methods is that they can process one observation at a time making it possible to do inference in an online setting. 
Experimental evidence suggests that these algorithms can give us computational benefits of an order of magnitude over Gibbs sampling (or its sequential variant - the particle filter) traditionally used in IBP based models. }, keywords = {ml bayes}, tagline = {We show that, just as A* and beam search can solve MAP inference for Dirichlet Processes, they can also be applied to the Indian Buffet Process.}, url = {http://pub.hal3.name/#daume09ibpsearch} } @InProceedings{daume09mrtf, author = {Hal {Daum\'e III}}, title = {Markov Random Topic Fields}, booktitle = {Association for Computational Linguistics (ACL)}, year = {2009}, address = {Singapore}, abstract = { Most approaches to topic modeling assume an independence between documents that is frequently violated. We present a topic model that makes use of one or more user-specified graphs describing relationships between documents. These graphs are encoded in the form of a Markov random field over topics and serve to encourage related documents to have similar topic structures. Experiments show upwards of a $10\%$ improvement in modeling performance. }, keywords = {nlp bayes ml}, tagline = {We show how to integrate topic models in an undirected graph for topic mining in -- for instance -- scientific publications. We explore several different model parameterizations.}, url = {http://pub.hal3.name/#daume09mrtf} } @InProceedings{daume09onepass, author = {Piyush Rai and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Streamed Learning: One-Pass {SVM}s}, booktitle = {International Joint Conference on Artificial Intelligence (IJCAI)}, year = {2009}, address = {Pasadena, CA}, abstract = { We present a streaming model for large-scale classification (in the context of l2-SVM) by leveraging connections between learning and computational geometry. The streaming model imposes the constraint that only a single pass over the data is allowed. 
The l2-SVM is known to have an equivalent formulation in terms of the minimum enclosing ball (MEB) problem, and an efficient algorithm based on the idea of core sets exists (CVM) [Tsang et al., 2005]. CVM learns a (1+$\epsilon$)-approximate MEB for a set of points and yields an approximate solution to the corresponding SVM instance. However CVM works in batch mode requiring multiple passes over the data. This paper presents a single-pass SVM which is based on the minimum enclosing ball of streaming data. We show that the MEB updates for the streaming case can be easily adapted to learn the SVM weight vector in a way similar to using online stochastic gradient updates. Our algorithm performs polylogarithmic computation at each example, and requires very small and constant storage. Experimental results show that, even in such restrictive settings, we can learn efficiently in just one pass and get accuracies comparable to other state-of-the-art SVM solvers (batch and online). We also give an analysis of the algorithm, and discuss some open issues and possible extensions. }, keywords = {ml}, tagline = {We present a one-pass approach to learning in the support vector machine framework. Our algorithm leverages the computational geometry view of SVMs as minimum enclosing balls, plus results on streaming MEBs.}, url = {http://pub.hal3.name/#daume09onepass} } @article{daume09searn, author = {Hal {Daum\'e III} and John Langford and Daniel Marcu}, title = {Search-based Structured Prediction}, year = {2009}, journal = {Machine Learning Journal (MLJ)}, abstract = { We present Searn, an algorithm for integrating search and learning to solve complex structured prediction problems such as those that occur in natural language, speech, computational biology, and vision. Searn is a meta-algorithm that transforms these complex problems into simple classification problems to which any binary classifier may be applied. 
Unlike current algorithms for structured learning that require decomposition of both the loss function and the feature functions over the predicted structure, Searn is able to learn prediction functions for any loss function and any class of features. Moreover, Searn comes with a strong, natural theoretical guarantee: good performance on the derived classification problems implies good performance on the structured prediction problem. }, keywords = {sp nlp ml}, tagline = {We describe an algorithm, Searn for solving structured prediction problems for cases where neither the loss nor the features decompose over the structure. This algorithm comes with reduction guarantees about performance, and strong empirical results in both standard sequence labeling tasks and a novel document summarization task.}, url = {http://pub.hal3.name/#daume09searn} } @Misc{daume09sslnlp, author = {Hal {Daum\'e III}}, title = {Semi-supervised or Semi-unsupervised?}, howpublished = {Invited paper: NAACL-HLT Workshop on Semi-supervised Learning in NLP (SSLNLP)}, year = {2009}, address = {Boulder, CO}, keywords = {nlp ml}, tagline = {Are you doing learning with labeled and unlabeled data in such a way that it would work with only unlabeled data or such that it would work with only labeled data? Why not both? This is an invited position paper for the SSLNLP workshop.}, url = {http://pub.hal3.name/#daume09sslnlp} } @InProceedings{daume09streaming, author = {Amit Goyal and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Streaming for Large Scale {NLP}: Language Modeling}, booktitle = {North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2009}, address = {Boulder, CO}, abstract = { In this paper, we explore a streaming algorithm paradigm to handle large amounts of data for NLP problems. We present an efficient low-memory method for constructing high-order approximate n-gram frequency counts. 
The method is based on a deterministic streaming algorithm which efficiently computes approximate frequency counts over a stream of data while employing a small memory footprint. We show that this method easily scales to billion-word monolingual corpora using a conventional (4 GB RAM) desktop machine. Statistical machine translation experimental results corroborate that the resulting high-n approximate small language model is as effective as models obtained from other count pruning methods. }, keywords = {nlp}, tagline = {We describe an approach to language modeling based on the streaming model of algorithms. We show that a single-pass, high efficiency approximate counting method can lead to tiny language models that perform as well as forever-to-build entropy-pruned language models.}, url = {http://pub.hal3.name/#daume09streaming} } @InProceedings{daume09subspacemtl, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Multitask Learning using Nonparametrically Learned Predictor Subspaces}, booktitle = {NeurIPS Workshop on Learning from Multiple Sources}, year = {2009}, address = {Whistler, Canada}, abstract = { Given several related learning tasks, we propose a nonparametric Bayesian learning model that captures task relatedness by assuming that the task parameters (i.e., weight vectors) share a latent subspace. More specifically, the intrinsic dimensionality of this subspace is not assumed to be known a priori. We use an infinite latent feature model - the Indian Buffet Process - to automatically infer this number. We also propose extensions of this model where the subspace learning can incorporate (labeled, and additionally unlabeled if available) examples, or the task parameters share a mixture of subspaces, instead of sharing a single subspace. The latter property can allow learning nonlinear manifold structure underlying the task parameters, and can also help in preventing negative transfer from outlier tasks. 
}, keywords = {ml bayes}, url = {http://pub.hal3.name/#daume09subspacemtl} } @InProceedings{daume09typpos, author = {Adam R. Teichert and Hal {Daum\'e III}}, title = {Unsupervised Part of Speech Tagging Without a Lexicon}, booktitle = {NeurIPS Workshop on Grammar Induction, Representation of Language and Language Learning (GIRLLL)}, year = {2009}, address = {Vancouver, Canada}, abstract = { }, keywords = {nlp bayes}, tagline = {We show how to use simple typological knowledge to improve unsupervised part of speech tagging. The basic result is that we can do without seed lexicons if we know a little linguistics.}, url = {http://pub.hal3.name/#daume09typpos} } @InProceedings{daume09unsearn, author = {Hal {Daum\'e III}}, title = {Unsupervised Search-based Structured Prediction}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2009}, address = {Montreal, Canada}, abstract = { We describe an adaptation and application of a search-based structured prediction algorithm "Searn" to unsupervised learning problems. We show that it is possible to reduce unsupervised learning to supervised learning and demonstrate a high-quality unsupervised shift-reduce parsing model. We additionally show a close connection between unsupervised Searn and expectation maximization. Finally, we demonstrate the efficacy of a semi-supervised extension. The key idea that enables this is an application of the predict-self idea for unsupervised learning. 
}, keywords = {nlp ml sp}, tagline = {We extend the Searn algorithm to handle unsupervised learning; experiments on dependency parsing are promising and easy to implement.}, url = {http://pub.hal3.name/#daume09unsearn} } @inproceedings{daume10aoml, author = {Avishek Saha and Piyush Rai and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Active Online Multitask Learning}, booktitle = {ICML 2010 Workshop on Budgeted Learning (Budget)}, year = {2010}, address = {Haifa, Israel}, abstract = { In this paper, we propose an online multitask learning framework where the weight vectors are updated in an adaptive fashion based on inter-task relatedness. Our work is in contrast with the earlier work on online multitask learning (Cavallanti et al., 2008) where the authors use a fixed interaction matrix of tasks to derive (fixed) update rules for all the tasks. In this work, we propose to update this interaction matrix itself in an adaptive fashion so that the weight vector updates are no longer fixed but are instead adaptive. Our framework can be extended to an active learning setting where the informativeness of an incoming instance across all the tasks can be evaluated using this adaptive interaction matrix. Empirical results on standardized datasets show improved performance in terms of accuracy, label complexity and number of mistakes made. }, keywords = {ml}, tagline = {We present a simple perceptron-like algorithm for active learning in an online, domain adaptation setting. Theory and experiments, both.}, } @inproceedings{daume10clustering, author = {Anusua Trivedi and Piyush Rai and Scott L. 
DuVall and Hal {Daum\'e III}}, title = {Exploiting Tag and Word Correlations for Improved Webpage Clustering}, booktitle = {Proceedings of {CIKM} Workshop on Search and Mining User-generated Contents (SMUC)}, year = {2010}, address = {Toronto, Canada}, abstract = { Automatic clustering of webpages helps a number of information retrieval tasks, such as improving user interfaces, collection clustering, introducing diversity in search results, etc. Typically, webpage clustering algorithms only use features extracted from the page-text. However, the advent of social-bookmarking websites, such as StumbleUpon1 and Delicious, has led to a huge amount of user-generated content such as the tag information that is associated with the webpages. In this paper, we present a subspace based feature extraction approach which leverages tag information to complement the page-contents of a webpage to extract highly discriminative features, with the goal of improved clustering performance. In our approach, we consider page-text and tags as two separate views of the data, and learn a shared subspace that maximizes the correlation between the two views. Any clustering algorithm can then be applied in this subspace. We compare our subspace based approach with a number of baselines that use tag information in various other ways, and show that the subspace based approach leads to improved performance on the webpage clustering task. Although our results here are on the webpage clustering task, the same approach can be used for webpage classification as well. In the end, we also suggest possible future work for leveraging tag information in webpage clustering, especially when tag information is present for not all, but only for a small number of webpages. 
}, keywords = {ml}, tagline = {We use kernel CCA as a multiview learning approach to clustering web pages based on both their content, and their associated social tags.}, } @article{daume10conjugate, author = {Arvind Agarwal and Hal {Daum\'e III}}, title = {A geometric view of conjugate priors}, year = {2010}, journal = {Machine Learning Journal (MLJ)}, volume = {81}, number = {1}, abstract = { In Bayesian machine learning, conjugate priors are popular, mostly due to mathematical convenience. In this paper, we show that there are deeper reasons for choosing a conjugate prior. Specifically, we formulate the conjugate prior in the form of Bregman divergence and show that it is the inherent geometry of conjugate priors that makes them appropriate and intuitive. This geometric interpretation allows one to view the hyperparameters of conjugate priors as the effective sample points, thus providing additional intuition. We use this geometric understanding of conjugate priors to derive the hyperparameters and expression of the prior used to couple the generative and discriminative components of a hybrid model for semi-supervised learning. }, keywords = {ml bayes}, tagline = {We give a geometric interpretation of conjugate priors, showing that they are the natural class of priors under certain information geometric analyses.}, url = {http://pub.hal3.name/#daume10conjugate} } @InProceedings{daume10coreg, author = {Abhishek Kumar and Avishek Saha and Hal {Daum\'e III}}, title = {A Co-regularization Based Semi-supervised Domain Adaptation}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2010}, address = {Vancouver, Canada}, abstract = { This paper presents a co-regularization based approach to semi-supervised domain adaptation. 
Our proposed approach (EA++) builds on the notion of augmented space (introduced in EASYADAPT (EA) [1]) and harnesses unlabeled data in target domain to further enable the transfer of information from source to target. This semi-supervised approach to domain adaptation is extremely simple to implement and can be applied as a pre-processing step to any supervised learner. Our theoretical analysis (in terms of Rademacher complexity) of EA and EA++ show that the hypothesis class of EA++ has lower complexity (compared to EA) and hence results in tighter generalization bounds. Experimental results on sentiment analysis tasks reinforce our theoretical findings and demonstrate the efficacy of the proposed method when compared to EA as well as a few other baseline approaches. }, keywords = {ml da}, tagline = {We perform a theoretical analysis of EasyAdapt (my ACL 2007 paper) using a coregularization approach, and then derive, from this theory, a semi-supervised extension that's equally easy to implement as a preprocessing step.}, url = {http://pub.hal3.name/#daume10coreg} } @InProceedings{daume10daal, author = {Piyush Rai and Avishek Saha and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Domain Adaptation meets Active Learning}, booktitle = {Proceedings of HLT/NAACL Workshop on Active Learning for NLP (ALNLP)}, year = {2010}, address = {Los Angeles, CA}, abstract = { In this work, we show how active learning in some (target) domain can leverage information from a different but related (source) domain. We present an algorithm that harnesses the source domain data to learn the best possible initializer hypothesis for doing active learning in the target domain, resulting in improved label complexity. We also present a variant of this algorithm which additionally uses the domain divergence information to selectively query the most informative points in the target domain, leading to further reductions in label complexity. 
Experimental results on a variety of datasets establish the efficacy of the proposed methods. }, keywords = {ml da}, tagline = {We present a simple algorithm for active domain adaptation that achieves significant reductions in sample complexity.}, url = {http://pub.hal3.name/#daume10daal} } @InProceedings{daume10distsim, author = {Amit Goyal and Jagadeesh Jagarlamudi and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Sketch Techniques for Scaling Distributional Similarity to the Web}, booktitle = {GEometrical Models of Natural Language Semantics Workshop (GEMS) at ACL}, year = {2010}, address = {Uppsala, Sweden}, abstract = { In this paper, we propose a memory, space, and time efficient framework to scale distributional similarity to the web. We exploit sketch techniques, especially the Count-Min sketch, which approximates the frequency of an item in the corpus without explicitly storing the item itself. These methods use hashing to deal with massive amounts of the streaming text. We store all item counts computed from 90 GB of web data in just 2 billion counters (8 GB main memory) of CM sketch. Our method returns semantic similarity between word pairs in O(K) time and can compute similarity between any word pairs that are stored in the sketch. In our experiments, we show that our framework is as effective as using the exact counts. }, keywords = {nlp}, tagline = {We show how to efficiently compute distributional similarities from 90 GB of web data in 8 GB of RAM using count-min sketches.}, url = {http://pub.hal3.name/#daume10distsim} } @inproceedings{daume10easyss, title = {Frustratingly Easy Semi-Supervised Domain Adaptation}, author = {Hal {Daum\'e III} and Abhishek Kumar and Avishek Saha}, booktitle = {Workshop on Domain Adaptation for NLP}, year = {2010}, abstract = { In this work, we propose a semi-supervised extension to a well-known supervised domain adaptation approach (EA) (Daume III, 2007). 
Our proposed approach (EA++) builds on the notion of augmented space (introduced in EA) and harnesses unlabeled data in target domain to ameliorate the transfer of information from source to target. This semi-supervised approach to domain adaptation is extremely simple to implement, and can be applied as a pre-processing step to any supervised learner. Experimental results on sequential labeling tasks demonstrate the efficacy of the proposed method. }, keywords = {nlp ml}, url = {http://pub.hal3.name/#daume10easyss}, } @InProceedings{daume10manifold, author = {Arvind Agarwal and Samuel Gerber and Hal {Daum\'e III}}, title = {Learning Multiple Tasks using Manifold Regularization}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2010}, address = {Vancouver, Canada}, abstract = { We present a novel method for multitask learning (MTL) based on manifold regularization. We assume that all task parameters lie on a manifold which is the generalization of the assumption made in the existing literature i.e., task parameters share a common linear subspace. The proposed method uses the projection distance from the manifold to regularize the task parameters. The manifold structure and the task parameters are learned using an alternating optimization framework. When the manifold structure is fixed, our method decomposes into learning independent tasks, making it appealing for learning new tasks. An approximation of the manifold regularization scheme is presented that preserves the convexity of the single task learning problem, and makes the proposed MTL framework efficient and easy to implement. We show the efficacy of our method on several datasets. 
}, keywords = {ml da}, tagline = {We approach multitask learning by assuming that all task parameters lie on an unknown manifold; we use a variant of manifold regularization to ensure convexity of the resulting single task learning problems.}, url = {http://pub.hal3.name/#daume10manifold} } @InProceedings{daume10mtlmls, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Multitask Learning via Mixture of Linear Subspaces}, booktitle = {NeurIPS Workshop on Transfer Learning by Learning Rich Generative Models}, year = {2010}, address = {Whistler, Canada}, abstract = { We propose a probabilistic generative model for multitask learning that exploits the cluster structure of the task parameters, and additionally imposes a low-rank constraint on the set of task parameters within each cluster. This leads to a sharing of statistical strengths of multiple tasks at two levels: (1) via cluster assumption, and (2) via a subspace assumption within each cluster. Our work brings in the benefits of both these aspects of task relationship, each of which has been addressed only individually in prior work. We assume a mixture of linear subspaces model on the latent task parameters that can capture both these aspects simultaneously. Furthermore, the mixture of subspaces assumption can model the fact that the task parameters could potentially live on a non-linear manifold instead of a linear subspace which is a restriction of earlier work on multitask learning based on the linear subspace assumption. }, keywords = {ml bayes}, url = {http://pub.hal3.name/#daume10mtlmls} } @InProceedings{daume10multilingual, author = {Jagadeesh Jagarlamudi and Hal {Daum\'e III}}, title = {Extracting Multilingual Topics from Unaligned Corpora}, booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR)}, year = {2010}, address = {Milton Keynes, United Kingdom}, abstract = { Topic models have been studied extensively in the context of monolingual corpora. 
Though there are some attempts to mine topical structure from cross-lingual corpora, they require clues about document alignments. In this paper we present a generative model called JointLDA which uses a bilingual dictionary to mine multilingual topics from an unaligned corpus. Experiments conducted on different data sets confirm our conjecture that jointly modeling the cross-lingual corpora offers several advantages compared to individual monolingual models. Since the JointLDA model merges related topics in different languages into a single multilingual topic: a) it can fit the data with relatively fewer topics. b) it has the ability to predict related words from a language different than that of the given document. In fact it has better predictive power compared to the bag-of-word based translation model leaving the possibility for JointLDA to be preferred over bag-of-word model for cross-lingual IR applications. We also found that the monolingual models learnt while optimizing the cross-lingual corpora are more effective than the corresponding LDA models. }, keywords = {nlp ml}, tagline = {We show how topic models can be used to model unaligned multilingual corpora, such as Wikipedia.}, url = {http://pub.hal3.name/#daume10multilingual} } @InProceedings{daume10mvincomplete, author = {Piyush Rai and Anusua Trivedi and Hal {Daum\'e III} and Scott L. DuVall}, title = {Multiview Clustering with Incomplete Views}, booktitle = {NeurIPS Workshop on Machine Learning for Social Computing}, year = {2010}, address = {Whistler, Canada}, abstract = { Multiview clustering algorithms allow leveraging information from multiple views of the data and therefore lead to improved clustering. A number of kernel based multiview clustering algorithms work by using the kernel matrices defined on the different views of the data. However, these algorithms assume availability of features from all the views of each example, i.e., assume that the kernel matrix for each view is complete. 
We present an approach that allows these algorithms to be applicable even when only one (the primary) view is complete and the auxiliary views are incomplete (i.e., features from these views are available only for some of the examples). Taking the kernel CCA based multiview clustering as an example, we apply our method on webpage clustering with multiple views of the data where one view is the page-text and other view is the social tags assigned to the webpage. We consider the case when the tags are available only for a small subset of the webpages which means that the tag view is incomplete. Experimental results establish the effectiveness of the proposed method. }, keywords = {ml bayes}, url = {http://pub.hal3.name/#daume10mvincomplete} } @InProceedings{daume10plotunits-emnlp, author = {Amit Goyal and Ellen Riloff and Hal {Daum\'e III}}, title = {Automatically Producing Plot Unit Representations for Narrative Text}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2010}, address = {Boston, MA}, abstract = { }, keywords = {nlp}, tagline = {We discuss the task of affect state analysis in the context of the Plot Units formalism, and how it differs from sentiment analysis. We show how existing resources, together with linguistically-inspired projection rules, and affective verbs extracted from large corpora can be put together to build a system for affect state identification in fables.}, url = {http://pub.hal3.name/#daume10plotunits-emnlp} } @InProceedings{daume10plotunits, author = {Amit Goyal and Ellen Riloff and Hal {Daum\'e III} and Nathan Gilbert}, title = {Toward Plot Units: Automatic Affect State Analysis}, booktitle = {Proceedings of HLT/NAACL Workshop on Computational Approaches to Analysis and Generation of Emotion in Text (CAET)}, year = {2010}, address = {Los Angeles, CA}, abstract = { We present a system called AESOP that automatically produces affect states associated with characters in a story. 
This research represents a first step toward the automatic generation of plot unit structures from text. AESOP incorporates several existing sentiment analysis tools and lexicons to evaluate the effectiveness of current sentiment technology on this task. AESOP also includes two novel components: a method for acquiring patient polarity verbs, which impart negative affect on their patients, and affect projection rules to propagate affect tags from surrounding words onto the characters in the story. We evaluate AESOP on a small collection of fables. }, keywords = {nlp}, tagline = {We discuss the task of affect state analysis in the context of the Plot Units formalism, and how it differs from sentiment analysis. We show how existing resources, together with linguistically-inspired projection rules, and affective verbs extracted from large corpora can be put together to build a system for affect state identification in fables.}, url = {http://pub.hal3.name/#daume10plotunits} } @InProceedings{daume10sketch, author = {Amit Goyal and Jagadeesh Jagarlamudi and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Sketching Techniques for Large Scale {NLP}}, booktitle = {Proceedings of HLT/NAACL Workshop on the Web as a Corpus (WAC)}, year = {2010}, address = {Los Angeles, CA}, abstract = { In this paper, we address the challenges posed by large amounts of text data by exploiting the power of hashing in the context of streaming data. We explore sketch techniques, especially the Count-Min Sketch, which approximates the frequency of a word pair in the corpus without explicitly storing the word pairs themselves. We use the idea of a conservative update with the Count-Min Sketch to reduce the average relative error of its approximate counts by a factor of two. We show that it is possible to store all words and word pairs counts computed from 37 GB of web data in just 2 billion counters (8 GB RAM). 
The number of these counters is up to 30 times less than the stream size which is a big memory and space gain. In Semantic Orientation experiments, the PMI scores computed from 2 billion counters are as effective as exact PMI scores. }, keywords = {nlp}, tagline = {We demonstrate the power of hashing techniques (including the count-min sketch) for counting word pairs. This leads to small, efficient models for computing pairwise mutual information of any two words.}, url = {http://pub.hal3.name/#daume10sketch} } @InProceedings{daume10sorting, author = {Jagadeesh Jagarlamudi and Seth Juarez and Hal {Daum\'e III}}, title = {Kernelized Sorting for Natural Language Processing}, booktitle = {Proceedings of the Conference on Artificial Intelligence (AAAI)}, year = {2010}, address = {Atlanta, Georgia}, abstract = { Kernelized sorting is an approach for matching objects from two sources (or domains) that does not require any prior notion of similarity between objects across the two sources. Unfortunately, this technique is highly sensitive to initialization and high dimensional data. We present variants of kernelized sorting to increase its robustness and performance on several Natural Language Processing (NLP) tasks: document matching from parallel and comparable corpora, machine transliteration and even image processing. Empirically we show that, on these tasks, a semi-supervised variant of kernelized sorting outperforms matching canonical correlation analysis. }, keywords = {nlp ml}, tagline = {We adapt the kernelized sorting algorithm for NLP tasks, and introduce a semi-supervised variant. 
We demonstrate its performance on document matching, transliteration and image processing.}, url = {http://pub.hal3.name/#daume10sorting} } @InProceedings{daume10spectral, author = {Abhishek Kumar and Piyush Rai and Hal {Daum\'e III}}, title = {Co-regularized Spectral Clustering with Multiple Kernels}, booktitle = {NeurIPS Workshop on New Directions in Multiple Kernel Learning}, year = {2010}, address = {Whistler, Canada}, abstract = { We propose a co-regularization based multiview spectral clustering algorithm which enforces the clusterings across multiple views to agree with each-other. Since each view can be used to define a similarity graph over the data, our algorithm can also be considered as learning with multiple similarity graphs, or equivalently with multiple kernels. We propose an objective function that implicitly combines two (or more) kernels, and leads to an improved clustering performance. Experimental comparisons with a number of baselines on several datasets establish the efficacy of our proposed approach. }, keywords = {ml}, url = {http://pub.hal3.name/#daume10spectral} }@InProceedings{daume10subspace, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Infinite Predictor Subspace Models for Multitask Learning}, booktitle = {Proceedings of the Conference on Artificial Intelligence and Statistics (AI-Stats)}, year = {2010}, address = {Sardinia, Italy}, abstract = { Given several related learning tasks, we propose a nonparametric Bayesian model that captures task relatedness by assuming that the task parameters (i.e., predictors) share a latent subspace. More specifically, the intrinsic dimensionality of the task subspace is not assumed to be known a priori. We use an infinite latent feature model to automatically infer this number (depending on and limited by only the number of tasks). 
Furthermore, our approach is applicable when the underlying task parameter subspace is inherently sparse, drawing parallels with l1 regularization and LASSO-style models. We also propose an augmented model which can make use of (labeled, and additionally unlabeled if available) inputs to assist learning this subspace, leading to further improvements in the performance. Experimental results demonstrate the efficacy of both the proposed approaches, especially when the number of examples per task is small. Finally, we discuss an extension of the proposed framework where a nonparametric mixture of linear subspaces can be used to learn a nonlinear manifold over the task parameters, and also deal with the issue of negative transfer from unrelated tasks. }, keywords = {ml bayes da}, tagline = {We present a non-parametric Bayesian model for multitask learning based on the assumption that task parameters live in a common, latent subspace.}, url = {http://pub.hal3.name/#daume10subspace} } @InProceedings{daume11alda, author = {Avishek Saha and Piyush Rai and Hal {Daum\'e III} and Suresh Venkatasubramanian and Scott L. DuVall}, title = {Active Supervised Domain Adaptation}, booktitle = {European Conference on Machine Learning (ECML)}, year = {2011}, address = {Athens, Greece}, keywords = {ml da}, abstract = { In this paper, we harness the synergy between two important learning paradigms, namely, active learning and domain adaptation. We show how active learning in a target domain can leverage information from a different but related source domain. Our proposed framework, Active Learning Domain Adapted (ALDA), uses source domain knowledge to transfer information that facilitates active learning in the target domain. We propose two variants of ALDA: a batch B-ALDA and an online O-ALDA. Empirical comparisons with numerous baselines on real-world datasets establish the efficacy of the proposed methods. 
}, url = {http://pub.hal3.name/#daume11alda}, } @inproceedings{daume11cascades, title = {Using Classifier Cascades for Scalable E-Mail Classification}, author = {Jay Pujara and Hal {Daum\'e III} and Lise Getoor}, booktitle = {CEAS}, year = {2011}, abstract = { In many real-world scenarios, we must make judgments in the presence of computational constraints. One common computational constraint arises when the features used to make a judgment each have differing acquisition costs, but there is a fixed total budget for a set of judgments. Particularly when there are a large number of classifications that must be made in a real-time, an intelligent strategy for optimizing accuracy versus computational costs is essential. E-mail classification is an area where accurate and timely results require such a trade-off. We identify two scenarios where intelligent feature acquisition can improve classifier performance. }, keywords = {ml}, url = {http://pub.hal3.name/#daume11cascades}, award = {Best Paper Award}, } @InProceedings{daume11cospec, author = {Abhishek Kumar and Hal {Daum\'e III}}, title = {A Co-training Approach for Multiview Spectral Clustering}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2011}, address = {Bellevue, WA}, abstract = { }, keywords = {ml}, url = {http://pub.hal3.name/#daume11cospec} } @Misc{daume11dunn, author = {Roger Levy and Hal {Daum\'e III}}, title = {Computational methods are invaluable for typology, but the models must match the questions: Commentary on Dunn et al. 
(2011)}, howpublished = {Journal of Linguistic Typology}, year = {2011}, keywords = {bayes ling}, url = {http://pub.hal3.name/#daume11dunn} } @InProceedings{daume11generation, author = {Yezhou Yang and Ching Lik Teo and Hal {Daum\'e III} and Yiannis Aloimonos}, title = {Corpus-Guided Sentence Generation of Natural Images}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2011}, address = {Edinburgh, Scotland}, abstract = { We propose a sentence generation strategy that describes images by predicting the most likely nouns, verbs, scenes and prepositions that make up the core sentence structure. The input are initial noisy estimates of the objects and scenes detected in the image using state of the art trained detectors. As predicting actions from still images directly is unreliable, we use a language model trained from the English Gigaword corpus to obtain their estimates; together with probabilities of co-located nouns, scenes and prepositions. We use these estimates as parameters on a HMM that models the sentence generation process, with hidden nodes as sentence components and image detections as the emissions. Experimental results show that our strategy of combining vision and language produces readable and descriptive sentences compared to naive strategies that use vision alone. }, keywords = {nlp}, } @InProceedings{daume11genkern, author = {Arvind Agarwal and Hal {Daum\'e III}}, title = {Generative Kernels for Exponential Families}, booktitle = {Conference on Artificial Intelligence and Statistics (AI-Stats)}, year = {2011}, address = {Ft. Lauderdale, FL}, abstract = { In this paper, we propose a family of kernels for the data distributions belonging to the exponential family. We call these kernels generative kernels because they take into account the generative process of the data. Our proposed method considers the geometry of the data distribution to build a set of efficient closed-form kernels best suited for that distribution. 
We compare our generative kernels on multinomial data and observe improved empirical performance across the board. Moreover, our generative kernels perform significantly better when training size is small, an important property of the generative models. }, keywords = {ml}, url = {http://pub.hal3.name/#daume11genkern} } @InProceedings{daume11ibpsearch, author = {Piyush Rai and Hal {Daum\'e III}}, title = {Beam Search based MAP Estimates for the Indian Buffet Process}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2011}, address = {Bellevue, WA}, abstract = { }, keywords = {ml bayes}, url = {http://pub.hal3.name/#daume11ibpsearch} } @InProceedings{daume11interlingual, author = {Jagadeesh Jagarlamudi and Hal {Daum\'e III} and Raghavendra Udupa}, title = {From Bilingual Dictionaries to Interlingual Document Representations}, booktitle = {Association for Computational Linguistics (ACL)}, year = {2011}, address = {Portland, OR}, abstract = { Mapping documents into an interlingual representation can help bridge the language barrier of a cross-lingual corpus. Previous approaches use aligned documents as training data to learn an interlingual representation, making them sensitive to the domain of the training data. In this paper, we learn an interlingual representation in an unsupervised manner using only a bilingual dictionary. We first use the bilingual dictionary to find candidate document alignments and then use them to find an interlingual representation. Since the candidate alignments are noisy, we develop a robust learning algorithm to learn the interlingual representation. We show that bilingual dictionaries generalize to different domains better: our approach gives better performance than either a word by word translation method or Canonical Correlation Analysis (CCA) trained on a different domain. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume11interlingual} } @InProceedings{daume11lcu, author = {Amit Goyal and Hal {Daum\'e III}}, title = {Lossy Conservative Update ({LCU}) sketch: Succinct approximate count storage}, booktitle = {Conference on Artificial Intelligence (AAAI)}, year = {2011}, address = {Portland, OR}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume11lcu} } @InProceedings{daume11lexicaladapt, author = {Hal {Daum\'e III} and Jagadeesh Jagarlamudi}, title = {Domain Adaptation for Machine Translation by Mining Unseen Words}, booktitle = {Association for Computational Linguistics}, year = {2011}, address = {Portland, OR}, abstract = { We show that unseen words account for a large part of the translation error when moving to new domains. Using an extension of a recent approach to mining translations from comparable corpora (Haghighi et al., 2008), we are able to find translations for otherwise OOV terms. We show several approaches to integrating such translations into a phrasebased translation system, yielding consistent improvements in translations quality (between 0.5 and 1.5 Bleu points) on four domains and two language pairs. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume11lexicaladapt} } @InProceedings{daume11mapmarg, author = {Jiarong Jiang and Piyush Rai and Hal {Daum\'e III}}, title = {Message-Passing for Approximate MAP Inference with Latent Variables}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2011}, address = {Granada, Spain}, keywords = {ml}, url = {http://pub.hal3.name/#daume11mapmarg} } @InProceedings{daume11multihash, author = {Amit Goyal and Piyush Rai and Hal {Daum\'e III}}, title = {Multiple Hash Functions for Learning}, booktitle = {NeurIPS Big Learning Workshop}, year = {2011}, address = {Sierra Nevada, Spain}, abstract = { In this paper, we explore the idea of feature-hashing in learning problems. 
We first evaluate some hashing strategies on the basis of their efficacy on classification problems. We then explore the following trade-off: Given a fixed budget (say K) for the hashed feature vector, should one use a single hash function that gives a hashed vector of size K, or use multiple hash functions to come up with smaller representations (say 3 hash functions, each giving a representation of size K=3)? In particular, for the latter setting, how should the different hashed representations be combined? We propose online learning algorithms for this setting using multiple Perceptrons (one for each hashed representation), and explore a number of Perceptron update and prediction schemes. Experimental results demonstrate that our update schemes give better classification accuracies than the case when a single hashed feature vector is used to train the model. }, keywords = {ml}, url = {http://pub.hal3.name/#daume11multihash} } @InProceedings{daume11olmt, author = {Avishek Saha and Piyush Rai and Hal {Daum\'e III} and Suresh Venkatasubramanian}, title = {Online Learning of Multiple Tasks and Their Relationships}, booktitle = {Conference on Artificial Intelligence and Statistics (AI-Stats)}, year = {2011}, address = {Ft. Lauderdale, FL}, abstract = { We propose an Online MultiTask Learning (OMTL) framework which simultaneously learns the task weight vectors as well as the task relatedness adaptively from the data. Our work is in contrast with prior work on online multitask learning which assumes fixed task relatedness, a priori. Furthermore, whereas prior work in such settings assume only positively correlated tasks, our framework can capture negative correlations as well. Our proposed framework learns the task relationship matrix by framing the objective function as a Bregman divergence minimization problem for positive definite matrices. 
Subsequently, we exploit this adaptively learned task-relationship matrix to select the most informative samples in an online multitask active learning setting. Experimental results on a number of real-world datasets and comparisons with numerous baselines establish the efficacy of our proposed approach. }, keywords = {ml da}, url = {http://pub.hal3.name/#daume11olmt} } @inproceedings{daume11robotic, title = {A Corpus-Guided Framework for Robotic Visual Perception}, author = {Ching L. Teo and Yezhou Yang and Hal {Daum\'e III} and Cornelia Ferm\"uller and Yiannis Aloimonos}, booktitle = {AAAI Workshop on Language-Action Tools for Cognitive Artificial Agents}, year = {2011}, abstract = { We present a framework that produces sentence-level summarizations of videos containing complex human activities that can be implemented as part of the Robot Perception Control Unit (RPCU). This is done via: 1) detection of pertinent objects in the scene: tools and direct-objects, 2) predicting actions guided by a large lexical corpus and 3) generating the most likely sentence description of the video given the detections. We pursue an active object detection approach by focusing on regions of high optical flow. Next, an iterative EM strategy, guided by language, is used to predict the possible actions. Finally, we model the sentence generation process as a HMM optimization problem, combining visual detections and a trained language model to produce a readable description of the video. Experimental results validate our approach and we discuss the implications of our approach to the RPCU in future applications. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume11robotic}, } @InProceedings{daume11sketch, author = {Amit Goyal and Hal {Daum\'e III}}, title = {Approximate Scalable Bounded Space Sketch for Large Data {NLP}}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2011}, address = {Edinburgh, Scotland}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume11sketch} } @article{daume11social, author = {Anusua Trivedi and Piyush Rai and Hal {Daum\'e III} and Scott L. DuVall}, title = {Leveraging Social Bookmarks from Partially Tagged Corpus for Improved Webpage Clustering}, journal = {ACM Transactions on Intelligent Systems and Technology}, year = {2011}, abstract = { }, keywords = {ml}, url = {http://pub.hal3.name/#daume11social} } @InProceedings{daume11sparse, author = {Jagadeesh Jagarlamudi and Raghavendra Udupa and Hal {Daum\'e III} and Abhijit Bhole}, title = {Improving Bilingual Projections via Sparse Covariance Matrices}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2011}, address = {Edinburgh, Scotland}, abstract = { }, keywords = {nlp}, } @InProceedings{daume11spectral, author = {Abhishek Kumar and Piyush Rai and Hal {Daum\'e III}}, title = {Co-regularized Multi-view Spectral Clustering}, booktitle = {Proceedings of the Conference on Neural Information Processing Systems (NeurIPS)}, year = {2011}, address = {Granada, Spain}, keywords = {ml}, url = {http://pub.hal3.name/#daume11spectral} } @InProceedings{daume11tradeoffs, author = {Jason Eisner and Hal {Daum\'e III}}, title = {Speed-Accuracy Tradeoffs in Nondeterministic Inference Algorithms}, booktitle = {Proceedings of COST: NeurIPS 2011 Workshop on Computational Trade-offs in Statistical Learning}, year = {2011}, address = {Sierra Nevada, Spain}, abstract = { Statistical learning has led to great advances in building models that achieve high accuracy. 
However, test-time inference in these models can be slow, for example in structured prediction problems. This is frequently addressed by using test-time heuristics to guide and prune the search for a good structured output. In this high-level paper, we ask: Could we explicitly train such heuristics to trade off accuracy and efficiency? And how does this relate to existing learning problems? }, keywords = {ml nlp}, url = {http://pub.hal3.name/#daume11tradeoffs} } @InProceedings{daume11wassa, author = {Amit Goyal and Hal {Daum\'e III}}, title = {Generating Semantic Orientation Lexicon using Large Data and Thesaurus}, booktitle = {Proceedings of ACL Workshop on Computational Approaches to Subjectivity and Sentiment Analysis (WASSA)}, year = {2011}, address = {Portland, OR}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume11wassa} } @InProceedings{daume12binarymkl, author = {Abhishek Kumar and Alexandru Niculescu-Mizil and Koray Kavukcuoglu and Hal {Daum\'e III}}, title = {A Binary Classification Framework for Two-Stage Multiple Kernel Learning}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2012}, abstract = { With the advent of kernel methods, automating the task of specifying a suitable kernel has become increasingly important. In this context, the Multiple Kernel Learning (MKL) problem of finding a combination of prespecified base kernels that is suitable for the task at hand has received significant attention from researchers. In this paper we show that Multiple Kernel Learning can be framed as a standard binary classification problem with additional constraints that ensure the positive definiteness of the learned kernel. Framing MKL in this way has the distinct advantage that it makes it easy to leverage the extensive research in binary classification to develop better performing and more scalable MKL algorithms that are conceptually simpler, and, arguably, more accessible to practitioners. 
Experiments on nine data sets from different domains show that, despite its simplicity, the proposed technique compares favorably with current leading MKL approaches. }, keywords = {ml}, url = {http://pub.hal3.name/#daume12binarymkl} } @InProceedings{daume12coaching, author = {He He and Hal {Daum\'e III} and Jason Eisner}, title = {Imitation Learning by Coaching}, booktitle = {Neural Information Processing Systems (NeurIPS)}, year = {2012}, abstract = { Imitation Learning has been shown to be successful in solving many challenging real-world problems. Some recent approaches give strong performance guarantees by training the policy iteratively. However, it is important to note that these guarantees depend on how well the policy we found can imitate the oracle on the training data. When there is a substantial difference between the oracle’s ability and the learner’s policy space, we may fail to find a policy that has low error on the training set. In such cases, we propose to use a coach that demonstrates easy-to-learn actions for the learner and gradually approaches the oracle. By a reduction of learning by demonstration to online learning, we prove that coaching can yield a lower regret bound than using the oracle. We apply our algorithm to cost-sensitive dynamic feature selection, a hard decision problem that considers a user-specified accuracy-cost trade-off. Experimental results on UCI datasets show that our method outperforms state-of-the-art imitation learning methods in dynamic feature selection and two static feature selection methods. }, keywords = {imitation learning, online learning}, url = {http://pub.hal3.name/#daume12coaching} } @InProceedings{daume12desctext, author = {Jesse Dodge and Amit Goyal and Xufeng Han and Alyssa Mensch and Margaret Mitchell and Karl Stratos and Kota Yamaguchi and Yejin Choi and Hal {Daum\'e III} and Alexander C. Berg and Tamara L. 
Berg}, title = {Detecting Visual Text}, booktitle = {North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2012}, abstract = { When people describe a scene, they often include information that is not visually apparent; sometimes based on background knowledge, sometimes to tell a story. We aim to separate visual text---descriptions of what is being seen---from non-visual text in natural images and their descriptions. To do so, we first concretely define what it means to be visual, annotate visual text and then develop algorithms to automatically classify noun phrases as visual or non-visual. We find that using text alone, we are able to achieve high accuracies at this task, and that incorporating features derived from computer vision algorithms improves performance. Finally, we show that we can reliably mine visual nouns and adjectives from large corpora and that we can use these effectively in the classification task. }, keywords = {nlp vision}, url = {http://pub.hal3.name/#daume12desctext} } @inproceedings{daume12distributed, title = {Efficient Protocols for Distributed Classification and Optimization}, author = {Hal {Daum\'e III} and Jeff M. Phillips and Avishek Saha and Suresh Venkatasubramanian}, booktitle = {ALT}, year = {2012}, abstract = { In distributed learning, the goal is to perform a learning task over data distributed across multiple nodes with minimal (expensive) communication. Prior work (Daume III et al., 2012) proposes a general model that bounds the communication required for learning classifiers while allowing for $\epsilon$ training error on linearly separable data adversarially distributed across nodes. In this work, we develop key improvements and extensions to this basic model. Our first result is a two-party multiplicative-weight-update based protocol that uses $O(d^2 \log{1/\epsilon})$ words of communication to classify distributed data in arbitrary dimension d, $\epsilon$-optimally. 
This readily extends to classification over k nodes with $O(kd^2 \log{1/\epsilon})$ words of communication. Our proposed protocol is simple to implement and is considerably more efficient than baselines compared, as demonstrated by our empirical results. In addition, we illustrate general algorithm design paradigms for doing efficient learning over distributed data. We show how to solve fixed-dimensional and high dimensional linear programming efficiently in a distributed setting where constraints may be distributed across nodes. Since many learning problems can be viewed as convex optimization problems where constraints are generated by individual points, this models many typical distributed learning scenarios. Our techniques make use of a novel connection from multipass streaming, as well as adapting the multiplicative-weight-update framework more generally to a distributed setting. As a consequence, our methods extend to the wide range of problems solvable using these techniques. }, keywords = {ml}, url = {http://pub.hal3.name/#daume12distributed}, } @inproceedings{daume12dynafea, title = {Cost-sensitive Dynamic Feature Selection}, author = {He He and Hal {Daum\'e III} and Jason Eisner}, booktitle = {ICML 2012 Workshop on Interactions between Inference and Learning (Inferning)}, year = {2012}, address = {Edinburgh, Scotland}, abstract = { We present an instance-specific dynamic feature selection algorithm at test time, which sequentially chooses features given values of already selected features and stops to make a prediction according to a user-specified speed-accuracy trade-off. We apply imitation learning techniques to address the problem of learning and inference jointly in a simple multiclass classification setting. Our feature selection method treats the given solver (e.g. a classifier trained with a full set of features) as a black box and does not have any constraint on it. 
Experimental results show that using a dynamic instance-specific feature set can significantly improve accuracy at a low cost. }, keywords = {ml}, url = {http://pub.hal3.name/#daume12dynafea} } @InProceedings{daume12flag, author = {Amit Goyal and Hal {Daum\'e III} and Raul Guerra}, title = {Fast Large-Scale Approximate Graph Construction for {NLP}}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2012}, keywords = {nlp}, url = {http://pub.hal3.name/#daume12flag} } @InProceedings{daume12flexiblemtl, author = {Alexandre Passos and Piyush Rai and Jacques Wainer and Hal {Daum\'e III}}, title = {Flexible Modeling of Latent Task Structures in Multitask Learning}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2012}, address = {Edinburgh, Scotland}, abstract = { Multitask learning algorithms are typically designed assuming some fixed, a priori known latent structure shared by all the tasks. However, it is usually unclear what type of latent task structure is the most appropriate for a given multitask learning problem. Ideally, the ``right'' latent task structure should be learned in a data-driven manner. We present a flexible, nonparametric Bayesian model that posits a mixture of factor analyzers structure on the tasks. The nonparametric aspect makes the model expressive enough to subsume many existing models of latent task structures (e.g., mean-regularized tasks, clustered tasks, low-rank or linear/non-linear subspace assumption on tasks, etc.). Moreover, it can also learn more general task structures, addressing the shortcomings of such models. We present a variational inference algorithm for our model. Experimental results on synthetic and real-world datasets, on both regression and classification problems, demonstrate the effectiveness of the proposed method. 
}, keywords = {ml bayes}, url = {http://pub.hal3.name/#daume12flexiblemtl} } @InProceedings{daume12gma, author = {Abhishek Sharma and Abhishek Kumar and Hal {Daum\'e III} and David Jacobs}, title = {Generalized Multiview Analysis: A Discriminative latent space}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2012}, abstract = { This paper presents a general multi-view feature extraction approach that we call Generalized Multiview Analysis or GMA. GMA has all the desirable properties required for cross-view classification and retrieval: it is supervised, it allows generalization to unseen classes, it is multi-view and kernelizable, it affords an efficient eigenvalue based solution and is applicable to any domain. GMA exploits the fact that most popular supervised and unsupervised feature extraction techniques are the solution of a special form of a quadratic constrained quadratic program (QCQP), which can be solved efficiently as a generalized eigenvalue problem. GMA solves a joint, relaxed QCQP over different feature spaces to obtain a single (non)linear subspace. Intuitively, GMA is a supervised extension of Canonical Correlational Analysis (CCA), which is useful for cross-view classification and retrieval. The proposed approach is general and has the potential to replace CCA whenever classification or retrieval is the purpose and label information is available. We outperform previous approaches for text-image retrieval on Pascal and Wiki text-image data. We report state-of-the-art results for pose and lighting invariant face recognition on the MultiPIE face dataset, significantly outperforming other approaches. 
}, keywords = {vision}, url = {http://pub.hal3.name/#daume12gma} } @InProceedings{daume12gomtl, author = {Abhishek Kumar and Hal {Daum\'e III}}, title = {Learning Task Grouping and Overlap in Multi-task Learning}, booktitle = {International Conference on Machine Learning (ICML)}, year = {2012}, abstract = { In the paradigm of multi-task learning, multiple related prediction tasks are learned jointly, sharing information across the tasks. We propose a framework for multi-task learning that enables one to selectively share the information across the tasks. We assume that each task parameter vector is a linear combination of a finite number of underlying basis tasks. The coefficients of the linear combination are sparse in nature and the overlap in the sparsity patterns of two tasks controls the amount of sharing across these. Our model is based on the assumption that task parameters within a group lie in a low dimensional subspace but allows the tasks in different groups to overlap with each other in one or more bases. Experimental results on four datasets show that our approach outperforms competing methods. }, keywords = {ml da}, url = {http://pub.hal3.name/#daume12gomtl} } @InProceedings{daume12importance, author = {Karl Stratos and Aneesh Sood and Alyssa Mensch and Xufeng Han and Margaret Mitchell and Kota Yamaguchi and Jesse Dodge and Amit Goyal and Hal {Daum\'e III} and Alexander C. Berg and Tamara L. Berg}, title = {Understanding and Predicting Importance in Images}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2012}, abstract = { What do people care about in an image? To drive computational visual recognition toward more human-centric outputs, we need a better understanding of how people perceive and judge the importance of content in images. In this paper, we explore how a number of factors relate to human perception of importance. Proposed factors fall into 3 broad types: 1) factors related to composition, e.g. 
size, location, 2) factors related to semantics, e.g. category of object or scene, and 3) contextual factors related to the likelihood of attribute-object, or object-scene pairs. We explore these factors using what people describe as a proxy for importance. Finally, we build models to predict what will be described about an image given either known image content, or image content estimated automatically by recognition systems. }, keywords = {nlp vision}, url = {http://pub.hal3.name/#daume12importance} } @inproceedings{daume12lowdim, title = {Low-dimensional Discriminative Reranking}, author = {Jagadeesh Jagarlamudi and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on North American Chapter of the Association for Computational Linguistics}, year = {2012}, address = {Montreal, Canada}, abstract = { The accuracy of many natural language processing tasks can be improved by a reranking step, which involves selecting a single output from a list of candidate outputs generated by a baseline system. We propose a novel family of reranking algorithms based on learning separate low-dimensional embeddings of the task’s input and output spaces. This embedding is learned in such a way that prediction becomes a low-dimensional nearest-neighbor search, which can be done computationally efficiently. A key quality of our approach is that feature engineering can be done separately on the input and output spaces; the relationship between inputs and outputs is learned automatically. Experiments on part-of-speech tagging task in four languages show significant improvements over a baseline decoder and existing reranking approaches. }, keywords = {nlp ml}, url = {http://pub.hal3.name/#daume12lowdim} } @InProceedings{daume12midge, author = {Margaret Mitchell and Jesse Dodge and Amit Goyal and Kota Yamaguchi and Karl Stratos and Xufeng Han and Alyssa Mensch and Alexander C. Berg and Tamara L. 
Berg and Hal {Daum\'e III}}, title = {Midge: Generating Image Descriptions From Computer Vision Detections}, booktitle = {European Chapter of the Association for Computational Linguistics (EACL)}, year = {2012}, abstract = { This paper introduces a novel generation system that composes humanlike descriptions of images from computer vision detections. By leveraging syntactically informed word co-occurrence statistics, the generator filters and constrains the noisy detections output from a vision system to generate syntactic trees that detail what the computer vision system sees. Results show that the generation system outperforms state-of-the-art systems, automatically generating some of the most natural image descriptions to date. }, keywords = {nlp vision}, award = {Test of Time Award (2022)}, url = {http://pub.hal3.name/#daume12midge} } @InProceedings{daume12pointquery, author = {Amit Goyal and Hal {Daum\'e III} and Graham Cormode}, title = {Sketch Algorithms for Estimating Point Queries in {NLP}}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, year = {2012}, keywords = {nlp}, url = {http://pub.hal3.name/#daume12pointquery} } @inproceedings{daume12prioritization, title = {Learned Prioritization for Trading Off Accuracy and Speed}, author = {Jiarong Jiang and Adam Teichert and Hal {Daum\'e III} and Jason Eisner}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, year = {2012}, abstract = { Users want inference to be both fast and accurate, but quality often comes at the cost of speed. The field has experimented with approximate inference algorithms that make different speed-accuracy tradeoffs (for particular problems and datasets). We aim to explore this space automatically, focusing here on the case of agenda-based syntactic parsing [12]. Unfortunately, off-the-shelf reinforcement learning techniques fail to learn good policies: the state space is simply too large to explore naively. 
An attempt to counteract this by applying imitation learning algorithms also fails: the ``teacher'' follows a far better policy than anything in our learner's policy space, free of the speed-accuracy tradeoff that arises when oracle information is unavailable, and thus largely insensitive to the known reward function. We propose a hybrid reinforcement/apprenticeship learning algorithm that learns to speed up an initial policy, trading off accuracy for speed according to various settings of a speed term in the loss function. }, keywords = {ml nlp}, url = {http://pub.hal3.name/#daume12prioritization}, } @inproceedings{daume12protocols, title = {Protocols for Learning Classifiers on Distributed Data}, author = {Hal {Daum\'e III} and Jeff Phillips and Avishek Saha and Suresh Venkatasubramanian}, booktitle = {Proceedings of the Workshop on Artificial Intelligence and Statistics (AI-Stats)}, year = {2012}, abstract = { We consider the problem of learning classifiers for labeled data that has been distributed across several nodes. Our goal is to find a single classifier, with small approximation error, across all datasets while minimizing the communication between nodes. This setting models real-world communication bottlenecks in the processing of massive distributed datasets. We present several very general sampling-based solutions as well as two-way protocols which have a provable exponential speed-up over any one-way protocol. We focus on core problems for noise-less data distributed across two or more nodes. The techniques we introduce are reminiscent of active learning, but rather than actively probing labels, nodes actively communicate with each other, each node simultaneously learning important data from another node. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume12protocols}, } @inproceedings{daume12quiz, title = {Besting the quiz master: crowdsourcing incremental classification games}, author = {Jordan Boyd-Graber and Brianna Satinoff and He He and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2012}, abstract = { Cost-sensitive classification, where the features used in machine learning tasks have a cost, has been explored as a means of balancing knowledge against the expense of incrementally obtaining new features. We introduce a setting where humans engage in classification with incrementally revealed features: the collegiate trivia circuit. By providing the community with a web-based system to practice, we collected tens of thousands of implicit word-by-word ratings of how useful features are for eliciting correct answers. Observing humans' classification process, we improve the performance of a state-of-the-art classifier. We also use the dataset to evaluate a system to compete in the incremental classification task through a reduction of reinforcement learning to classification. Our system learns when to answer a question, performing better than baselines and most human players. }, keywords = {ml nlp}, url = {http://pub.hal3.name/#daume12quiz}, } @inproceedings{daume12seeded, title = {Incorporating Lexical Priors into Topic Models}, author = {Jagadeesh Jagarlamudi and Hal {Daum\'e III} and Raghavendra Udupa}, booktitle = {Proceedings of the Conference on European Chapter of the Association for Computational Linguistics (EACL)}, year = {2012}, address = {Avignon, France}, abstract = { Topic models have great potential for helping users understand document corpora. This potential is stymied by their purely unsupervised nature, which often leads to topics that are neither entirely meaningful nor effective in extrinsic tasks (Chang et al., 2009). 
We propose a simple and effective way to guide topic models to learn topics of specific interest to a user. We achieve this by providing sets of seed words that a user believes are representative of the underlying topics in a corpus. Our model uses these seeds to improve both topic-word distributions (by biasing topics to produce appropriate seed words) and to improve document-topic distributions (by biasing documents to select topics related to the seed words they contain). Extrinsic evaluation on a document clustering task reveals a significant improvement when using seed information, even over other models that use seed information na{\"\i}vely. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume12seeded} } @inproceedings{daume12tasks, title = {Simultaneously Leveraging Output and Task Structures for Multiple-Output Regression}, author = {Piyush Rai and Abhishek Kumar and Hal {Daum\'e III}}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, year = {2012}, abstract = { Multiple-output regression models require estimating multiple parameters, one for each output. Structural regularization is usually employed to improve parameter estimation in such models. In this paper, we present a multiple-output regression model that leverages the covariance structure of the latent model parameters as well as the conditional covariance structure of the observed outputs. This is in contrast with existing methods that usually take into account only one of these structures. More importantly, unlike some of the other existing methods, none of these structures need be known a priori in our model, and are learned from the data. Several previously proposed structural regularization based multiple-output regression models turn out to be special cases of our model. 
Moreover, in addition to being a rich model for multiple-output regression, our model can also be used in estimating the graphical model structure of a set of variables (multivariate outputs) conditioned on another set of variables (inputs). Experimental results on both synthetic and real datasets demonstrate the effectiveness of our method. }, keywords = {ml}, url = {http://pub.hal3.name/#daume12tasks}, } @InProceedings{daume12transliterate, author = {Jagadeesh Jagarlamudi and Hal {Daum\'e III}}, title = {Regularized Interlingual Projections: Evaluation on Multilingual Transliteration}, booktitle = {Proceedings of the 2012 Joint Conference on Empirical Methods in Natural Language Processing and Computational Natural Language Learning}, month = {July}, year = {2012}, address = {Jeju Island, Korea}, publisher = {Association for Computational Linguistics}, pages = {12--23}, url = {http://www.aclweb.org/anthology/D12-1002} } @inproceedings{daume12watson, title = {Towards a Watson That Sees: Language-Guided Action Recognition for Robots}, author = {Ching Lik Teo and Yezhou Yang and Hal {Daum\'e III} and Cornelia Ferm\"uller and Yiannis Aloimonos}, booktitle = {ICRA}, year = {2012}, abstract = { For robots of the future to interact seamlessly with humans, they must be able to reason about their surroundings and take actions that are appropriate to the situation. Such reasoning is only possible when the robot has knowledge of how the World functions, which must either be learned or hardcoded. In this paper, we propose an approach that exploits language as an important resource of high-level knowledge that a robot can use, akin to IBM’s Watson in Jeopardy!. In particular, we show how language can be leveraged to reduce the ambiguity that arises from recognizing actions involving hand-tools from video data. 
Starting from the premise that tools and actions are intrinsically linked, with one explaining the existence of the other, we trained a language model over a large corpus of English newswire text so that we can extract this relationship directly. This model is then used as a prior to select the best tool and action that explains the video. We formalize the approach in the context of 1) an unsupervised recognition and 2) a supervised classification scenario by an EM formulation for the former and integrating language features for the latter. Results are validated over a new hand-tool action dataset, and comparisons with state of the art STIP features showed significantly improved results when language is used. In addition, we discuss the implications of these results and how it provides a framework for integrating language into vision on other robotic applications. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume12watson}, } @inproceedings{daume13bushy, title = {Binary to Bushy: Bayesian Hierarchical Clustering with the Beta Coalescent}, author = {Yuening Hu and Jordan Boyd-Graber and Hal {Daum\'e III} and Z. Irene Ying}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, year = {2013}, abstract = { Discovering hierarchical regularities in data is a key problem in interacting with large datasets, modeling cognition, and encoding knowledge. A previous Bayesian solution—Kingman’s coalescent—provides a probabilistic model for data represented as a binary tree. Unfortunately, this is inappropriate for data better described by bushier trees. We generalize an existing belief propagation framework of Kingman’s coalescent to the beta coalescent, which models a wider range of tree structures. Because of the complex combinatorial search over possible structures, we develop new sampling schemes using sequential Monte Carlo and Dirichlet process mixture models, which render inference efficient and tractable. 
We present results on synthetic and real data that show the beta coalescent outperforms Kingman's coalescent and is qualitatively better at capturing data in bushy hierarchies. }, keywords = {ml}, url = {http://pub.hal3.name/#daume13bushy}, } @article{daume13clinical, title = {Improving performance of natural language processing part-of-speech tagging on clinical narratives through domain adaptation}, author = {Jeffrey Ferraro and Hal {Daum\'e III} and Scott DuVall and Wendy Chapman and Henk Harkema and Peter Haug}, journal = {Journal of the American Medical Informatics Association}, year = {2013}, abstract = { Natural language processing (NLP) tasks are commonly decomposed into subtasks, chained together to form processing pipelines. The residual error produced in these subtasks propagates, adversely affecting the end objectives. Limited availability of annotated clinical data remains a barrier to reaching state-of-the-art operating characteristics using statistically based NLP tools in the clinical domain. Here we explore the unique linguistic constructions of clinical texts and demonstrate the loss in operating characteristics when out-of-the-box part-of-speech (POS) tagging tools are applied to the clinical domain. We test a domain adaptation approach integrating a novel lexical-generation probability rule used in a transformation-based learner to boost POS performance on clinical narratives. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume13clinical}, } @inproceedings{daume13depfeat, title = {Dynamic Feature Selection for Dependency Parsing}, author = {He He and Hal {Daum\'e III} and Jason Eisner}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2013}, abstract = { Feature computation and exhaustive search have significantly restricted the speed of graph-based dependency parsing. 
We propose a faster framework of dynamic feature selection, where features are added sequentially as needed, edges are pruned early, and decisions are made online for each sentence. We model this as a sequential decision-making problem and solve it by imitation learning techniques. We test our method on 7 languages. Our dynamic parser can achieve accuracies comparable or even superior to parsers using a full set of features, while computing fewer than 30% of the feature templates. }, keywords = {nlp ml}, url = {http://pub.hal3.name/#daume13depfeat}, } @inproceedings{daume13detm, title = {Discriminatively Enhanced Topic Models}, author = {Snigdha Chaturvedi and Hal {Daum\'e III} and Taesun Moon}, booktitle = {International Conference on Data Mining (ICDM)}, year = {2013}, abstract = { }, keywords = {ml nlp}, url = {http://pub.hal3.name/#daume13detm}, } @inproceedings{daume13dialogoutcomes, title = {Predicting Dialogue Outcomes over Structured Latent Representations}, author = {Dan Goldwasser and Hal {Daum\'e III}}, booktitle = {NeurIPS Workshop on Output Representation Learning}, year = {2013}, abstract = { }, keywords = {nlp, ml}, url = {http://pub.hal3.name/#daume13dialogoutcomes}, } @inproceedings{daume13dvh, title = {Predictable Dual-View Hashing}, author = {Mohammad Rastegari and Jonghyun Choi and Shobeir Fakhraei and Hal {Daum\'{e} III} and Larry S. Davis}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2013}, abstract = { We propose a Predictable Dual-View Hashing (PDH) algorithm which embeds proximity of data samples in the original spaces. We create a cross-view hamming space with the ability to compare information from previously incomparable domains with a notion of ‘predictability’. By performing comparative experimental analysis on two large datasets, PASCAL-Sentence and SUN-Attribute, we demonstrate the superiority of our method to the state-of-the-art dual-view binary code learning algorithms. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume13dvh}, } @inproceedings{daume13engagementmooc, title = {Modeling Learner Engagement in MOOCs using Probabilistic Soft Logic}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal {Daum\'e III} and Lise Getoor}, booktitle = {NeurIPS Workshop on Data Driven Education}, year = {2013}, abstract = { Massive open online courses (MOOCs) attract a large number of student registrations, but recent studies have shown that only a small fraction of these students complete their courses. Student dropouts are thus a major deterrent for the growth and success of MOOCs. We believe that understanding student engagement as a course progresses is essential for minimizing dropout rates. Formally defining student engagement in an online setting is challenging. In this paper, we leverage activity (such as posting in discussion forums, timely submission of assignments, etc.), linguistic features from forum content and structural features from forum interaction to identify two different forms of student engagement (passive and active) in MOOCs. We use probabilistic soft logic (PSL) to model student engagement by capturing domain knowledge about student interactions and performance. We test our models on MOOC data from Coursera and demonstrate that modeling engagement is helpful in predicting student performance. }, keywords = {nlp, mooc}, url = {http://pub.hal3.name/#daume13engagementmooc}, } @inproceedings{daume13graphkernel, title = {A Topical Graph Kernel for Link Prediction in Labeled Graphs}, author = {Snigdha Chaturvedi and Hal {Daum\'e III} and Taesun Moon and Shashank Srivastava}, booktitle = {ICML workshop on Mining and Learning with Graphs (MLG)}, year = {2013}, abstract = { This paper proposes a solution to the problem of link prediction in labeled graphs with additional text information associated with the nodes. 
By fitting a topic model on the text corpus and some processing, we compute the topics of interest to a node. We propose a walk based graph kernel which incorporates the node’s interest and thus represents structural as well as textual information. We then make predictions about the existence of unseen links using a kernelized SVM. Our experiments with an author citation network shows that our method is effective and significantly outperforms a network-oriented approach. }, keywords = {ml}, url = {http://pub.hal3.name/#daume13graphkernel}, } @inproceedings{daume13hrtf, title = {Kernel Regression for Head-Related Transfer Function Interpolation and Spectral Extrema Extraction}, author = {Yuancheng Luo and Dmitry N. Zotkin and Hal {Daum\'e III} and Ramani Duraiswami}, booktitle = {Proceedings of the IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, year = {2013}, abstract = { Head-Related Transfer Function (HRTF) representation and interpolation is an important problem in spatial audio. We present a kernel regression method based on Gaussian process (GP) modeling of the joint spatial-frequency relationship between HRTF measurements and obtain a smooth non-linear representation based on data measured over both arbitrary and structured spherical measurement grids. This representation is further extended to the problem of extracting spectral extrema (notches and peaks). We perform HRTF interpolation and spectral extrema extraction using freely available CIPIC HRTF data. Experimental results are shown. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume13hrtf}, } @inproceedings{daume13mm, title = {Monolingual Marginal Matching for Translation Model Adaptation}, author = {Ann Irvine and Chris Quirk and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2013}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume13mm}, } @article{daume13mterrors, title = {Measuring Machine Translation Errors in New Domains}, author = {Ann Irvine and John Morgan and Marine Carpuat and Hal {Daum\'e III} and Dragos Munteanu}, journal = {Transactions of the Association for Computational Linguistics (TACL)}, year = {2013}, abstract = { We develop two techniques for analyzing the effect of porting a machine translation system to a new domain. One is a macro-level analysis that measures how domain shift affects corpus-level evaluation; the second is a micro-level analysis for word-level errors. We apply these methods to understand what happens when a Parliament-trained phrase-based machine translation system is applied in four very different domains: news, medical texts, scientific articles and movie subtitles. We present quantitative and qualitative experiments that highlight opportunities for future research in domain adaptation for machine translation. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume13mterrors}, } @inproceedings{daume13pabp, title = {Prioritized Asynchronous Belief Propagation}, author = {Jiarong Jiang and Taesun Moon and Hal {Daum\'e III} and Jason Eisner}, booktitle = {ICML Workshop on Inferning}, year = {2013}, abstract = { Message scheduling is shown to be very effective in belief propagation (BP) algorithms. However, most existing scheduling algorithms use fixed heuristics regardless of the structure of the graphs or properties of the distribution. On the other hand, designing different scheduling heuristics for all graph structures are not feasible. 
In this paper, we propose a reinforcement learning based message scheduling framework (RLBP) to learn the heuristics automatically which generalizes to any graph structures and distributions. In the experiments, we show that the learned problem-specific heuristics largely outperform other baselines in speed. }, keywords = {ml}, url = {http://pub.hal3.name/#daume13pabp}, } @article{daume13plotunits, author = {Amit Goyal and Ellen Riloff and Hal {Daum\'e III}}, title = {A Computational Model for Plot Units}, journal = {Computational Intelligence Journal}, year = {2013}, volume = {29}, number = {3}, abstract = { This research revisits plot units, which were developed in the 1980s as a conceptual knowledge structure to represent the affect states of and emotional tensions between characters in narrative stories. We present a fully automated system, called AESOP, that generates plot unit representations for narrative texts. AESOP performs four steps: affect state recognition, character identification, affect state projection, and link creation. We also identify a type of knowledge that seems to be missing from existing lexical resources: verbs that impart positive or negative polarity onto their patients (e.g., ``eat'' imparts negative polarity because being eaten is bad, whereas ``fed'' imparts positive polarity because being fed is good). We develop two techniques to automatically harvest these ``patient polarity verbs'' (PPVs) from a Web corpus, and show that the PPVs improve affect state recognition. Finally, we evaluate AESOP's performance on a set of fables, and present several analyses to shed light on the capabilities and limitations of current natural language processing technology for plot unit generation. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume13plotunits} } @inproceedings{daume13semanticmt, title = {Modeling Syntactic and Semantic Structures in Hierarchical Phrase-based Translation}, author = {Junhui Li and Philip Resnik and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2013}, abstract = { Incorporating semantic structure into a linguistics-free translation model is challenging, since semantic structures are closely tied to syntax. In this paper, we propose a two-level approach to exploiting predicate-argument structure reordering in a hierarchical phrase-based translation model. First, we introduce linguistically motivated constraints into a hierarchical model, guiding translation phrase choices in favor of those that respect syntactic boundaries. Second, based on such translation phrases, we propose a predicate-argument structure reordering model that predicts reordering not only between an argument and its predicate, but also between two arguments. Experiments on Chinese-to-English translation demonstrate that both advances significantly improve translation accuracy. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume13semanticmt}, } @inproceedings{daume13sensespotting, title = {{SenseSpotting}: Never let your parallel data tie you to an old domain}, author = {Marine Carpuat and Hal {Daum\'e III} and Katharine Henry and Ann Irvine and Jagadeesh Jagarlamudi and Rachel Rudinger}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2013}, abstract = { Words often gain new senses in new domains. Being able to automatically identify, from a corpus of monolingual text, which word tokens are being used in a previously unseen sense has applications to machine translation and other tasks sensitive to lexical semantics. 
We define a task, SenseSpotting, in which we build systems to spot tokens that have new senses in new domain text. Instead of difficult and expensive annotation, we build a gold standard by leveraging cheaply available parallel corpora, targeting our approach to the problem of domain adaptation for machine translation. Our system is able to achieve F-measures of as much as 80%, when applied to word types it has never seen before. Our approach is based on a large set of novel features that capture varied aspects of how words change when used in new domains. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume13sensespotting}, } @inproceedings{daume14deepqa, title = {A Neural Network for Factoid Question Answering over Paragraphs}, author = {Mohit Iyyer and Jordan Boyd-Graber and Leonardo Claudino and Richard Socher and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2014}, abstract = { Text classification methods for tasks like factoid question answering typically use manually defined string matching rules or bag of words representations. These methods are ineffective when question text contains very few individual words (e.g., named entities) that are indicative of the answer. We introduce a recursive neural network (RNN) model that can reason over such input by modeling textual compositionality. We apply our model, QANTA, to a dataset of questions from a trivia competition called quiz bowl. Unlike previous RNN models, QANTA learns word and phrase-level representations that combine across sentences to reason about entities. The model outperforms multiple baselines and, when combined with information retrieval methods, rivals the best human players. 
}, keywords = {nlp, ml}, url = {http://pub.hal3.name/#daume14deepqa}, } @inproceedings{daume14iobject, title = {``I Object!'' Modeling Latent Pragmatic Effects in Courtroom Dialogues}, author = {Dan Goldwasser and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the European Association for Computational Linguistics (EACL)}, year = {2014}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume14iobject}, } @inproceedings{daume14lts, title = {Efficient programmable learning to search}, author = {Kai-Wei Chang and Hal {Daum\'e III} and John Langford and St\'ephane Ross}, booktitle = {NeurIPS}, year = {2014}, abstract = { We improve ``learning to search'' approaches to structured prediction in two ways. First, we show that the search space can be defined by an arbitrary imperative program, reducing the number of lines of code required to develop new structured prediction tasks by orders of magnitude. Second, we make structured prediction orders of magnitude faster through various algorithmic improvements. }, keywords = {ml}, url = {http://pub.hal3.name/#daume14lts}, } @inproceedings{daume14ltsbb, title = {Learning to search in branch and bound algorithms}, author = {He He and Hal {Daum\'e III} and Jason M. Eisner}, booktitle = {NeurIPS}, year = {2014}, abstract = { Branch-and-bound is a widely used method in combinatorial optimization, including mixed integer programming, structured prediction and MAP inference. While most work has been focused on developing problem-specific techniques, little is known about how to systematically design the node searching strategy on a branch-and-bound tree. We address the key challenge of learning an adaptive node searching order for any class of problem solvable by branch-and-bound. Our strategies are learned by imitation learning. We apply our algorithm to linear programming based branch-and-bound for solving mixed integer programs (MIP). 
We compare our method with one of the fastest open-source solvers, SCIP; and a very efficient commercial solver, Gurobi. We demonstrate that our approach achieves better solutions faster on four MIP libraries. }, keywords = {ml}, url = {http://pub.hal3.name/#daume14ltsbb}, } @inproceedings{daume14moocengagement, title = {Uncovering Hidden Engagement Patterns for Predicting Learner Performance in MOOCs}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal {Daum\'e III} and Lise Getoor}, booktitle = {Learning at Scale}, year = {2014}, abstract = { Maintaining and cultivating student engagement is a prerequisite for MOOCs to have broad educational impact. Understanding student engagement as a course progresses helps characterize student learning patterns and can aid in minimizing dropout rates, initiating instructor intervention. In this paper, we construct a probabilistic model connecting student behavior and class performance, formulating student engagement types as latent variables. We show that our model identifies course success indicators that can be used by instructors to initiate interventions and assist students. 
}, keywords = {mooc, ml}, url = {http://pub.hal3.name/#daume14moocengagement}, } @inproceedings{daume14moocintervention, title = {Predicting Instructor Intervention in MOOC Forums}, author = {Snigdha Chaturvedi and Dan Goldwasser and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2014}, abstract = { Instructor intervention in student discussion forums is a vital component in Massive Open Online Courses (MOOCs), where personalized interaction is limited. This paper introduces the problem of predicting instructor interventions in MOOC forums. We propose several prediction models designed to capture unique aspects of MOOCs, combining course information, forum structure and posts content. Our models abstract contents of individual posts of threads using latent categories, learned jointly with the binary intervention prediction problem. Experiments over data from two Coursera MOOCs demonstrate that incorporating the structure of threads into the learning problem leads to better predictive performance. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume14moocintervention}, } @inproceedings{daume14mooclearner, title = {Learning Latent Engagement Patterns of Students in Online Courses}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal {Daum\'e III} and Lise Getoor}, booktitle = {Proceedings of the National Conference on Artificial Intelligence (AAAI)}, year = {2014}, abstract = { Maintaining and cultivating student engagement is critical for learning. Understanding factors affecting student engagement will help in designing better courses and improving student retention. The large number of participants in massive open online courses (MOOCs) and data collected from their interaction with the MOOC open up avenues for studying student engagement at scale. 
In this work, we develop a framework for modeling and understanding student engagement in online courses based on student behavioral cues. Our first contribution is the abstraction of student engagement using latent representations. We use that abstraction in a probabilistic model to connect student behavior with course completion. We demonstrate that the latent formulation for engagement helps in predicting student survival across three MOOCs. Next, in order to initiate better instructor interventions, we need to be able to predict student survival early in the course. We demonstrate that we can predict student survival early in the course reliably using the latent model. Finally, we perform a closer quantitative analysis of user interaction with the MOOC and identify student activities that are good indicators for survival at different points in the course. }, keywords = {mooc, ml}, url = {http://pub.hal3.name/#daume14mooclearner}, } @inproceedings{daume14seededmooc, title = {Understanding {MOOC} Discussion Forums using seeded {LDA}}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal {Daum\'e III} and Lise Getoor}, booktitle = {Workshop on Innovative Use of NLP for Building Educational Applications}, year = {2014}, abstract = { Discussion forums serve as a platform for student discussions in massive open online courses (MOOCs). Analyzing content in these forums can uncover useful information for improving student retention and help in initiating instructor intervention. In this work, we explore the use of topic models, particularly seeded topic models toward this goal. We demonstrate that features derived from topic analysis help in predicting student survival. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume14seededmooc}, } @inproceedings{daume14simultaneousmt, title = {Don't Until the Final Verb Wait: Reinforcement Learning for Simultaneous Machine Translation}, author = {Alvin Grissom II and Jordan Boyd-Graber and He He and John Morgan and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2014}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume14simultaneousmt}, } @inproceedings{daume14vwmoses, title = {Integrating a Discriminative Classifier into Phrase-based and Hierarchical Decoding}, author = {Ale\v{s} Tamchyna and Fabienne Braune and Alexander Fraser and Marine Carpuat and Hal {Daum\'e III} and Chris Quirk}, booktitle = {The Prague Bulletin of Mathematical Linguistics}, year = {2014}, abstract = { Current state-of-the-art statistical machine translation (SMT) relies on simple feature functions which make independence assumptions at the level of phrases or hierarchical rules. However, it is well-known that discriminative models can benefit from rich features extracted from the source sentence context outside of the applied phrase or hierarchical rule, which is available at decoding time. We present a framework for the open-source decoder Moses that allows discriminative models over source context to easily be trained on a large number of examples and then be included as feature functions in decoding. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume14vwmoses}, } @inproceedings{daume15da, title = {Guest Editor's Introduction to the Special Issue on Domain Adaptation for Vision Applications}, author = {Dong Xu and Rama Chellappa and Trevor Darrell and Hal {Daum\'e III}}, booktitle = {IJCV}, year = {2015}, abstract = { }, keywords = {vision}, url = {http://pub.hal3.name/#daume15da}, } @inproceedings{daume15dan, title = {Deep unordered composition rivals syntactic methods for text classification}, author = {Mohit Iyyer and Varun Manjunatha and Jordan Boyd-Graber and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2015}, abstract = { Many existing deep learning models for natural language processing tasks focus on learning the compositionality of their inputs, which requires many expensive computations. We present a simple deep neural network that competes with and, in some cases, outperforms such models on sentiment analysis and factoid question answering tasks while taking only a fraction of the training time. While our model is syntactically-ignorant, we show significant improvements over previous bag-of-words models by deepening our network and applying a novel variant of dropout. Moreover, our model performs better than syntactic models on datasets with high syntactic variance. We show that our model makes similar errors to syntactically-aware models, indicating that for the tasks we consider, nonlinearly transforming the input is more important than tailoring a network to incorporate word order and syntax. 
}, keywords = {nlp ml}, url = {http://pub.hal3.name/#daume15dan}, } @inproceedings{daume15inverse, title = {On Correcting inputs: Inverse Optimization for Online Structured Prediction}, author = {Hal {Daum\'e III} and Samir Khuller and Manish Purohit and Gregory Sanders}, booktitle = {FSTTCS}, year = {2015}, abstract = { Algorithm designers typically assume that the input data is correct, and then proceed to find “optimal” or “sub-optimal” solutions using this input data. However this assumption of correct data does not always hold in practice, especially in the context of online learning systems where the objective is to learn appropriate feature weights given some training samples. Such scenarios necessitate the study of inverse optimization problems where one is given an input instance as well as a desired output and the task is to adjust the input data so that the given output is indeed optimal. Motivated by learning structured prediction models, in this paper we consider inverse optimization with a margin, i.e., we require the given output to be better than all other feasible outputs by a desired margin. We consider such inverse optimization problems for maximum weight matroid basis, matroid intersection, perfect matchings, minimum cost maximum flows, and shortest paths and derive the first known results for such problems with a non-zero margin. The effectiveness of these algorithmic approaches to online learning for structured prediction is also discussed. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume15inverse}, } @inproceedings{daume15lols, title = {Learning to search better than your teacher}, author = {Kai-Wei Chang and Akshay Krishnamurthy and Alekh Agarwal and Hal {Daum\'e III} and John Langford}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2015}, abstract = { Methods for learning to search for structured prediction typically imitate a reference policy, with existing theoretical guarantees demonstrating low regret compared to that reference. This is unsatisfactory in many applications where the reference policy is suboptimal and the goal of learning is to improve upon it. Can learning to search work even when the reference is poor? We provide a new learning to search algorithm, LOLS, which does well relative to the reference policy, but additionally guarantees low regret compared to deviations from the learned policy: a local-optimality guarantee. Consequently, LOLS can improve upon the reference policy, unlike previous algorithms. This enables us to develop structured contextual bandits, a partial information structured prediction setting with many potential applications. }, keywords = {ml}, url = {http://pub.hal3.name/#daume15lols}, } @inproceedings{daume15reductions, title = {Learning Reductions that Really Work}, author = {Alina Beygelzimer and Hal {Daum\'e III} and John Langford and Paul Mineiro}, booktitle = {IEEE Proceedings}, year = {2015}, abstract = { We provide a summary of the mathematical and computational techniques that have enabled learning reductions to effectively address a wide class of problems, and show that this approach to solving machine learning problems can be broadly useful. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume15reductions}, } @inproceedings{daume15referring, title = {Why discourse affects speakers' choice of referring expressions}, author = {Naho Orita and Eliana Vornov and Naomi H Feldman and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2015}, abstract = { We propose a language production model that uses dynamic discourse information to account for speakers' choices of referring expressions. Our model extends previous rational speech act models (Frank and Goodman, 2012) to more naturally distributed linguistic data, instead of assuming a controlled experimental setting. Simulations show a close match between speakers' utterances and model predictions, indicating that speakers' behavior can be modeled in a principled way by considering the probabilities of referents in the discourse and the information conveyed by each word. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume15referring}, } @inproceedings{daume15rewrite, title = {Syntax-based Rewriting for Simultaneous Machine Translation}, author = {He He and Alvin Grissom II and Jordan Boyd-Graber and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2015}, abstract = { Divergent word order between languages causes delay in simultaneous machine translation. We present a sentence rewriting method that generates more monotonic translations to improve the speed-accuracy tradeoff. We design grammaticality and meaning-preserving syntactic transformation rules that operate on constituent parse trees. We apply the rules to reference translations to make their word order closer to the source language word order. 
On Japanese-English translation (two languages with substantially different structure), incorporating the rewritten, more monotonic reference translation into a phrase-based machine translation system enables better translations faster than a baseline system that only uses gold reference translations. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume15rewrite}, } @inproceedings{daume15zeropronoun, title = {Dialogue focus tracking for zero pronoun resolution}, author = {Sudha Rao and Allyson Ettinger and Hal {Daum\'e III} and Philip Resnik}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2015}, abstract = { We take a novel approach to zero pronoun resolution in Chinese: our model explicitly tracks the flow of focus in a discourse. Our approach, which generalizes to deictic references, is not reliant on the presence of overt noun phrase antecedents to resolve to, and allows us to address the large percentage of ``non-anaphoric'' pronouns filtered out in other approaches. We furthermore train our model using readily available parallel Chinese/English corpora, allowing for training without hand-annotated data. Our results demonstrate improvements on two test sets, as well as the usefulness of linguistically motivated features. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume15zeropronoun}, } @inproceedings{daume16ask, title = {Ask, and Shall You Receive? Understanding Desire Fulfillment in Natural Language Text}, author = {Snigdha Chaturvedi and Dan Goldwasser and Hal {Daum\'e III}}, booktitle = {Proceedings of the National Conference on Artificial Intelligence (AAAI)}, year = {2016}, abstract = { The ability to comprehend wishes or desires and their fulfillment is important to Natural Language Understanding. This paper introduces the task of identifying if a desire expressed by a subject in a given short piece of text was fulfilled. 
We propose various unstructured and structured models that capture fulfillment cues such as the subject's emotional state and actions. Our experiments with two different datasets demonstrate the importance of understanding the narrative and discourse structure to address this task. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16ask}, } @inproceedings{daume16autoencode, title = {Learning Text Pair Similarity with Context-sensitive Autoencoders}, author = {Hadi Amiri and Philip Resnik and Jordan Boyd-Graber and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2016}, abstract = { We present a pairwise context-sensitive Autoencoder for computing text pair similarity. Our model encodes input text into context-sensitive representations and uses them to compute similarity between text pairs. Our model outperforms the state-of-the-art models in two semantic retrieval tasks and a contextual word similarity task. For retrieval, our unsupervised approach that merely ranks inputs with respect to the cosine similarity between their hidden representations shows comparable performance with the state-of-the-art supervised models and in some cases outperforms them. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16autoencode}, } @inproceedings{daume16churn, title = {Short Text Representation for Detecting Churn in Microblogs}, author = {Hadi Amiri and Hal {Daum\'e III}}, booktitle = {Proceedings of the National Conference on Artificial Intelligence (AAAI)}, year = {2016}, abstract = { Churn happens when a customer leaves a brand or stops using its services. Brands reduce their churn rates by identifying and retaining potential churners through customer retention campaigns. In this paper, we consider the problem of classifying micro-posts as churny or non-churny with respect to a given brand. 
Motivated by the recent success of recurrent neural networks (RNNs) in word representation, we propose to utilize RNNs to learn micro-post and churn indicator representations. We show that such representations improve the performance of churn detection in microblogs and lead to more accurate ranking of churny contents. Furthermore, in this research we show that state-of-the-art sentiment analysis approaches fail to identify churny contents. Experiments on Twitter data about three telco brands show the utility of our approach for this task. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16churn}, } @inproceedings{daume16clpsych, title = {The UMD CLPsych 2016 Shared Task System: Text Representation for Predicting Triage of Forum Posts about Mental Health}, author = {Meir Friedenberg and Hadi Amiri and Hal {Daum\'e III} and Philip Resnik}, booktitle = {Workshop on CL for Clinical Psychology}, year = {2016}, abstract = { We report on a multiclass classifier for triage of mental health forum posts as part of the CLPsych 2016 shared task. We investigate a number of document representations, including topic models and representation learning to represent posts in semantic space, including context- and emotion-sensitive feature representations of posts. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16clpsych}, } @inproceedings{daume16comics, title = {The Amazing Mysteries of the Gutter: Drawing Inferences Between Panels in Comic Book Narratives}, author = {Mohit Iyyer and Varun Manjunatha and Anupam Guha and Yogarshi Vyas and Jordan Boyd-Graber and Hal {Daum\'e III} and Larry Davis}, booktitle = {CVPR}, year = {2016}, abstract = { Visual narrative is often a combination of explicit information and judicious omissions, relying on the viewer to supply missing details. In comics, most movements in time and space are hidden in the ``gutters'' between panels. 
To follow the story, readers logically connect panels together by inferring unseen actions through a process called “closure”. While computers can now describe what is explicitly depicted in natural images, in this paper we examine whether they can understand the closure-driven narratives conveyed by stylized artwork and dialogue in comic book panels. We construct a dataset, COMICS, that consists of over 1.2 million panels (120 GB) paired with automatic textbox transcriptions. An in-depth analysis of COMICS demonstrates that neither text nor image alone can tell a comic book story, so a computer must understand both modalities to keep up with the plot. We introduce three cloze-style tasks that ask models to predict narrative and character-centric aspects of a panel given n preceding panels as context. Various deep neural architectures underperform human baselines on these tasks, suggesting that COMICS contains fundamental challenges for both vision and language. }, keywords = {vision}, url = {http://pub.hal3.name/#daume16comics}, } @inproceedings{daume16compiler, title = {A Credit Assignment Compiler for Joint Prediction}, author = {Kai-Wei Chang and He He and St\'ephane Ross and Hal {Daum\'e III} and John Langford}, booktitle = {Advances in Neural Information Processing Systems (NeurIPS)}, year = {2016}, abstract = { Many machine learning applications involve jointly predicting multiple mutually dependent output variables. Learning to search is a family of methods where the complex decision problem is cast into a sequence of decisions via a search space. Although these methods have shown promise both in theory and in practice, implementing them has been burdensomely awkward. In this paper, we show the search space can be defined by an arbitrary imperative program, turning learning to search into a credit assignment compiler. Altogether with the algorithmic improvements for the compiler, we radically reduce the complexity of programming and the running time. 
We demonstrate the feasibility of our approach on multiple joint prediction tasks. In all cases, we obtain accuracies as high as alternative approaches, at drastically reduced execution and programming time. }, keywords = {ml}, url = {http://pub.hal3.name/#daume16compiler}, } @inproceedings{daume16feuding, title = {Feuding Families and Former Friends: Unsupervised Learning for Dynamic Fictional Relationships}, author = {Mohit Iyyer and Anupam Guha and Snigdha Chaturvedi and Jordan Boyd-Graber and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2016}, abstract = { Understanding how a fictional relationship between two characters changes over time (e.g., from best friends to sworn enemies) is a key challenge in digital humanities scholarship. We present a novel unsupervised neural network for this task that incorporates dictionary learning to generate interpretable, accurate relationship trajectories. While previous work on characterizing literary relationships relies on plot summaries annotated with predefined labels, our model jointly learns a set of global relationship descriptors as well as a trajectory over these descriptors for each relationship in a dataset of raw text from novels. We find that our model learns descriptors of events (e.g., marriage or murder) as well as interpersonal states (love, sadness). Our model outperforms topic model baselines on two crowdsourced tasks, and we also find interesting correlations to annotations in an existing dataset. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume16feuding}, award = {Best Paper Award}, } @inproceedings{daume16image, title = {Large scale retrieval and generation of image descriptions}, author = {Vicente Ord\'o\~nez and Xufeng Han and Polina Kuznetsova and Girish Kulkarni and Margaret Mitchell and Kota Yamaguchi and Karl Stratos and Amit Goyal and Jesse Dodge and Alyssa Mensch and Hal {Daum\'e III} and Alexander C. Berg and Yejin Choi and Tamara L. Berg}, booktitle = {IJCV}, year = {2016}, abstract = { What is the story of an image? What is the relationship between pictures, language, and information we can extract using state of the art computational recognition systems? In an attempt to address both of these questions, we explore methods for retrieving and generating natural language descriptions for images. Ideally, we would like our generated textual descriptions (captions) to both sound like a person wrote them, and also remain true to the image content. To do this we develop data-driven approaches for image description generation, using retrieval-based techniques to gather either: (a) whole captions associated with a visually similar image, or (b) relevant bits of text (phrases) }, keywords = {vision}, url = {http://pub.hal3.name/#daume16image}, } @inproceedings{daume16impact, title = {Predicting the impact of scientific concepts using full-text features}, author = {Kathy McKeown and Hal {Daum\'e III} and Snigdha Chaturvedi and John Paparrizos and Kapil Thadani and Pablo Barrio and Or Biran and Suvarna Bothe and Michael Collins and Kenneth R Fleischmann and Luis Gravano and Rahul Jha and Ben King and Kevin McInerney and Taesun Moon and Arvind Neelakantan and Diarmuid O'Seaghdha and Dragomir Radev and Clay Templeton and Simone Teufel}, booktitle = {JAIST}, year = {2016}, abstract = { New scientific concepts, interpreted broadly, are continuously introduced in the literature, but relatively few concepts have a long-term impact on society. 
The identification of such concepts is a challenging prediction task that would help multiple parties—including researchers and the general public—focus their attention within the vast scientific literature. In this paper we present a system that predicts the future impact of a scientific concept, represented as a technical term, based on the information available from recently published research articles. We analyze the usefulness of rich features derived from the full text of the articles through a variety of approaches, including rhetorical sentence analysis, information extraction, and time-series analysis. The results from two large-scale experiments with 3.8 million full-text articles and 48 million metadata records support the conclusion that full-text features are significantly more useful for prediction than metadata-only features and that the most accurate predictions result from combining the metadata and full-text features. Surprisingly, these results hold even when the metadata features are available for a much larger number of documents than are available for the full-text features. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16impact}, } @inproceedings{daume16interpretese, title = {Interpretese vs. Translationese: The Uniqueness of Human Strategies in Simultaneous Interpretation}, author = {He He and Jordan Boyd-Graber and Hal {Daum\'e III}}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2016}, abstract = { Computational approaches to simultaneous interpretation are stymied by how little we know about the tactics human interpreters use. We produce a parallel corpus of translated and simultaneously interpreted text and study differences between them through a computational approach. 
Our analysis reveals that human interpreters regularly apply several effective tactics to reduce translation latency, including sentence segmentation and passivization. In addition to these unique, clever strategies, we show that limited human memory also causes other idiosyncratic properties of human interpretation such as generalization and omission of source content. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16interpretese}, } @inproceedings{daume16literary, title = {Modeling Evolving Relationships Between Characters in Literary Novels}, author = {Snigdha Chaturvedi and Shashank Srivastava and Hal {Daum\'e III} and Chris Dyer}, booktitle = {Proceedings of the National Conference on Artificial Intelligence (AAAI)}, year = {2016}, abstract = { Studying characters plays a vital role in computationally representing and interpreting narratives. Unlike previous work, which has focused on inferring character roles, we focus on the problem of modeling their relationships. Rather than assuming a fixed relationship for a character pair, we hypothesize that relationships temporally evolve with the progress of the narrative, and formulate the problem of relationship modeling as a structured prediction problem. We propose a semi-supervised framework to learn relationship sequences from fully as well as partially labeled data. We present a Markovian model capable of accumulating historical beliefs about the relationship and status changes. We use a set of rich linguistic and semantically motivated features that incorporate world knowledge to investigate the textual content of narrative. We empirically demonstrate that such a framework outperforms competitive baselines. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume16literary}, } @inproceedings{daume16moses, title = {A Framework for Discriminative Rule Selection in Hierarchical Moses}, author = {Fabienne Braune and Alexander Fraser and Hal {Daum\'e III} and Ale\v{s} Tamchyna}, booktitle = {WMT}, year = {2016}, abstract = { Training discriminative rule selection models is usually expensive because of the very large size of the hierarchical grammar. Previous approaches reduced the training costs either by (i) using models that are local to the source side of the rules or (ii) by heavily pruning out negative samples. Moreover, all previous evaluations were performed on small scale translation tasks, containing at most 250,000 sentence pairs. We propose two contributions to discriminative rule selection. First, we test previous approaches on two French-English translation tasks in domains for which only limited resources are available and show that they fail to improve translation quality. To improve on such tasks, we propose a rule selection model that is (i) global with rich label-dependent features (ii) trained with all available negative samples. Our global model yields significant improvements, up to 1 BLEU point, over previously proposed rule selection models. Second, we successfully scale rule selection models to large translation tasks but have so far failed to produce significant improvements in BLEU on these tasks. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume16moses}, } @inproceedings{daume16opponent, title = {Opponent Modeling in Deep Reinforcement Learning}, author = {He He and Jordan Boyd-Graber and Kevin Kwok and Hal {Daum\'e III}}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2016}, abstract = { Opponent modeling is necessary in multi-agent settings where secondary agents with competing goals also adapt their strategies, yet it remains challenging because strategies interact with each other and change. 
Most previous work focuses on developing probabilistic models or parameterized strategies for specific applications. Inspired by the recent success of deep reinforcement learning, we present neural-based models that jointly learn a policy and the behavior of opponents. Instead of explicitly predicting the opponent’s action, we encode observation of the opponents into a deep Q-Network (DQN); however, we retain explicit modeling (if desired) using multitasking. By using a Mixture-of-Experts architecture, our model automatically discovers different strategy patterns of opponents without extra supervision. We evaluate our models on a simulated soccer game and a popular trivia game, showing superior performance over DQN and its variants. }, keywords = {ml}, url = {http://pub.hal3.name/#daume16opponent}, } @inproceedings{daume16recalltree, title = {Logarithmic Time One-Against-Some}, author = {Hal {Daum\'e III} and Nikos Karampatziakis and John Langford and Paul Mineiro}, booktitle = {ICML}, year = {2016}, abstract = { We create a new online reduction of multiclass classification to binary classification for which training and prediction time scale logarithmically with the number of classes. We show that several simple techniques give rise to an algorithm that can compete with one-against-all in both space and predictive power while offering exponential improvements in speed when the number of classes is large. }, keywords = {ml}, url = {http://pub.hal3.name/#daume16recalltree}, } @inproceedings{daume17banditmt, title = {The UMD Neural Machine Translation Systems [at WMT17 Bandit Learning Task]}, author = {Amr Sharaf and Shi Feng and Khanh Nguyen and Kianté Brantley and Hal {Daum\'e III}}, booktitle = {WMT}, year = {2017}, link = {https://github.com/khanhptnk/bandit-nmt}, abstract = { We describe the University of Maryland machine translation systems submitted to the WMT17 German-English Bandit Learning Task. 
The task is to adapt a translation system to a new domain, using only bandit feedback: the system receives a German sentence to translate, produces an English sentence, and only gets a scalar score as feedback. Targeting these two challenges (adaptation and bandit learning), we built a standard neural machine translation system and extended it in two ways: (1) robust reinforcement learning techniques to learn effectively from the bandit feedback, and (2) domain adaptation using data selection from a large corpus of parallel data. }, keywords = {NLP ML}, url = {http://pub.hal3.name/#daume17banditmt}, } @inproceedings{daume17bioamr, title = {Biomedical event extraction using abstract meaning representation}, author = {Sudha Rao and Daniel Marcu and Kevin Knight and Hal {Daum\'e III}}, booktitle = {BioNLP}, year = {2017}, abstract = { We propose a novel, Abstract Meaning Representation (AMR) based approach to identifying molecular events/interactions in biomedical text. Our key contributions are: (1) an empirical validation of our hypothesis that an event is a subgraph of the AMR graph, (2) a neural network-based model that identifies such an event subgraph given an AMR, and (3) a distant supervision based approach to gather additional training data. We evaluate our approach on the 2013 Genia Event Extraction dataset (Kim et al., 2013) and show promising results. 
}, keywords = {NLP}, url = {http://pub.hal3.name/#daume17bioamr}, } @inproceedings{daume17blgnlp, title = {Proceedings of the First Workshop on Building Linguistically Generalizable NLP Systems}, author = {Emily Bender and Hal {Daum\'e III} and Allyson Ettinger and Sudha Rao}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2017}, abstract = { While the field of natural language processing has made tremendous strides as a result of machine learning techniques, systems trained within this traditional model typically do not generalize well beyond the characteristics of their training data. Especially with the influx of deep learning approaches in NLP, it is increasingly the case not only that systems are restricted in the conditions under which they work well—but also that we have little idea what exactly those conditions are. We believe that linguistic knowledge will be instrumental to addressing these issues, so for this workshop we designed a special shared task, with the goal of bringing together researchers from NLP and linguistics to test the true linguistic generalization capacities of NLP systems. In addition to the shared task, the workshop also welcomed research contribution papers on the topic of linguistically generalizable NLP systems. }, keywords = {NLP}, url = {http://pub.hal3.name/#daume17blgnlp}, } @inproceedings{daume17coal, title = {Active Learning for Cost-Sensitive Classification}, author = {Akshay Krishnamurthy and Alekh Agarwal and Tzu-Kuo Huang and Hal {Daum\'e III} and John Langford}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2017}, abstract = { We design an active learning algorithm for cost-sensitive multiclass classification: problems where different errors have different costs. Our algorithm, COAL, makes predictions by regressing on each label’s cost and predicting the smallest. 
On a new example, it uses a set of regressors that perform well on past data to estimate possible costs for each label. It queries only the labels that could be the best, ignoring the sure losers. We prove COAL can be efficiently implemented for any regression family that admits squared loss optimization; it also enjoys strong guarantees with respect to predictive performance and labeling effort. We empirically compare COAL to passive learning, showing significant improvements in labeling effort and test cost. }, keywords = {ml}, url = {http://pub.hal3.name/#daume17coal}, } @inproceedings{daume17evolve, title = {Unsupervised Learning of Evolving Relationships Between Literary Characters}, author = {Snigdha Chaturvedi and Mohit Iyyer and Hal {Daum\'e III}}, booktitle = {Proceedings of the National Conference on Artificial Intelligence (AAAI)}, year = {2017}, abstract = { Understanding inter-character relationships is fundamental for understanding character intentions and goals in a narra- tive. This paper addresses unsupervised modeling of relation- ships between characters. We model relationships as dynamic phenomenon, represented as evolving sequences of latent states empirically learned from data. Unlike most previous work our approach is completely unsupervised. This enables data-driven inference of inter-character relationship types be- yond simple sentiment polarities, by incorporating lexical and semantic representations, and leveraging large quantities of raw text. We present three models based on rich sets of lin- guistic features that capture various cues about relationships. We compare these models with existing techniques and also demonstrate that relationship categories learned by our model are semantically coherent. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume17evolve}, } @inproceedings{daume17gutter, title = {The amazing mysteries of the gutter: Drawing inferences between panels in comic book narratives}, author = {Mohit Iyyer and Varun Manjunatha and Anupam Guha and Yogarshi Vyas and Jordan Boyd-Graber and Hal {Daum\'e III} and Larry Davis}, booktitle = {Computer Vision and Pattern Recognition (CVPR)}, year = {2017}, abstract = { Visual narrative is often a combination of explicit information and judicious omissions, relying on the viewer to supply missing details. In comics, most movements in time and space are hidden in the “gutters” between panels. To follow the story, readers logically connect panels together by inferring unseen actions through a process called “closure”. While computers can now describe what is explicitly depicted in natural images, in this paper we examine whether they can understand the closure-driven narratives conveyed by stylized artwork and dialogue in comic book panels. We construct a dataset, COMICS, that consists of over 1.2 million panels (120 GB) paired with automatic textbox transcriptions. An in-depth analysis of COMICS demonstrates that neither text nor image alone can tell a comic book story, so a computer must understand both modalities to keep up with the plot. We introduce three cloze-style tasks that ask models to predict narrative and character-centric aspects of a panel given n preceding panels as context. Various deep neural architectures underperform human baselines on these tasks, suggesting that COMICS contains fundamental challenges for both vision and language. 
}, keywords = {CV}, url = {http://pub.hal3.name/#daume17gutter}, } @inproceedings{daume17oas, title = {Logarithmic time one-against-some}, author = {Hal {Daum\'e III} and Nikos Karampatziakis and John Langford and Paul Mineiro}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2017}, abstract = { We create a new online reduction of multiclass classification to binary classification for which training and prediction time scale logarithmically with the number of classes. We show that several simple techniques give rise to an algorithm that can compete with one-against-all in both space and predictive power while offering exponential improvements in speed when the number of classes is large. }, keywords = {ml}, url = {http://pub.hal3.name/#daume17oas}, } @inproceedings{daume17simhuman, title = {Reinforcement Learning for Bandit Neural Machine Translation with Simulated Human Feedback}, author = {Khanh Nguyen and Hal {Daum\'e III} and Jordan Boyd-Graber}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2017}, abstract = { Machine translation is a natural candidate problem for reinforcement learning from human feedback: users provide quick, dirty ratings on candidate translations to guide a system to improve. Yet, current neural machine translation training focuses on expensive human-generated reference translations. We describe a reinforcement learning algorithm that improves neural machine translation systems from simulated human feedback. Our algorithm combines the advantage actor-critic algorithm (Mnih et al., 2016) with the attention-based neural encoder-decoder architecture (Luong et al., 2015). 
This algorithm (a) is well-designed for problems with a large action space and delayed rewards, (b) effectively optimizes traditional corpus-level machine translation metrics, and (c) is robust to skewed, high-variance, granular feedback modeled after actual human behaviors. }, keywords = {NLP ML}, url = {http://pub.hal3.name/#daume17simhuman}, } @inproceedings{daume17spbandit, title = {Structured prediction via learning to search under bandit feedback}, author = {Amr Sharaf and Hal {Daum\'e III}}, booktitle = {Workshop on Structured Prediction for NLP}, year = {2017}, abstract = { We present an algorithm for structured prediction under online bandit feedback. The learner repeatedly predicts a sequence of actions, generating a structured output. It then observes feedback for that output and no others. We consider two cases: a pure bandit setting in which it only observes a loss, and more fine-grained feedback in which it observes a loss for every action. We find that the fine-grained feedback is necessary for strong empirical performance, because it allows for a robust variance-reduction strategy. We empirically compare a number of different algorithms and exploration methods and show the efficacy of BLS on sequence labeling and dependency parsing tasks. }, keywords = {NLP ML}, url = {http://pub.hal3.name/#daume17spbandit}, } @inproceedings{daume18clarification, title = {Learning to Ask Good Questions: Ranking Clarification Questions using Neural Expected Value of Perfect Information}, author = {Sudha Rao and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2018}, abstract = { Inquiry is fundamental to communication, and machines cannot effectively collaborate with humans unless they can ask questions. In this work, we build a neural network model for the task of ranking clarification questions. 
Our model is inspired by the idea of expected value of perfect information: a good question is one whose expected answer will be useful. We study this problem using data from StackExchange, a plentiful online resource in which people routinely ask clarifying questions to posts so that they can better offer assistance to the original poster. We create a dataset of clarification questions consisting of ∼77K posts paired with a clarification question (and answer) from three domains of StackExchange: askubuntu, unix and superuser. We evaluate our model on 500 samples of this dataset against expert human judgments and demonstrate significant improvements over controlled baselines. }, keywords = {NLP}, url = {http://pub.hal3.name/#daume18clarification}, award = {Best Paper Award}, } @inproceedings{daume18datasheets, title = {Datasheets for Datasets}, author = {Timnit Gebru and Jamie Morgenstern and Briana Vecchione and Jennifer Wortman Vaughan and Hanna Wallach and Hal {Daum\'e III} and Kate Crawford}, booktitle = {arxiv}, year = {2018}, abstract = { Currently there is no standard way to identify how a dataset was created, and what characteristics, motivations, and potential skews it represents. To begin to address this issue, we propose the concept of a datasheet for datasets, a short document to accompany public datasets, commercial APIs, and pretrained models. The goal of this proposal is to enable better communication between dataset creators and users, and help the AI community move toward greater transparency and accountability. By analogy, in computer hardware, it has become industry standard to accompany everything from the simplest components (e.g., resistors), to the most complex microprocessor chips, with datasheets detailing standard operating characteristics, test results, recommended usage, and other information. We outline some of the questions a datasheet for datasets should answer. 
These questions focus on when, where, and how the training data was gathered, its recommended use cases, and, in the case of human-centric datasets, information regarding the subjects’ demographics and consent as applicable. We develop prototypes of datasheets for two well-known datasets: Labeled Faces in The Wild [33] and the Pang \& Lee Polarity Dataset [45]. }, keywords = {ML}, url = {http://pub.hal3.name/#daume18datasheets}, } @inproceedings{daume18hierilrl, title = {Hierarchical Imitation and Reinforcement Learning}, author = {Hoang M Le and Nan Jiang and Alekh Agarwal and Miroslav Dud\'ik and Yisong Yue and Hal {Daum\'e III}}, booktitle = {ICML}, year = {2018}, link = {https://sites.google.com/view/hierarchical-il-rl}, abstract = { We study the problem of learning policies over long time horizons. We present a framework that leverages and integrates two key concepts. First, we utilize hierarchical policy classes that enable planning over different time scales, i.e., the high level planner proposes a sequence of subgoals for the low level planner to achieve. Second, we utilize expert demonstrations within the hierarchical action space to dramatically reduce cost of exploration. Our framework is flexible and can incorporate different combinations of imitation learning (IL) and reinforcement learning (RL) at different levels of the hierarchy. Using long-horizon benchmarks, including Montezuma’s Revenge, we empirically demonstrate that our approach can learn significantly faster compared to hierarchical RL, and can be significantly more label- and sample-efficient compared to flat IL. We also provide theoretical analysis of the labeling cost for certain instantiations of our framework. 
}, keywords = {ML}, url = {http://pub.hal3.name/#daume18hierilrl}, } @inproceedings{daume18neuripsdi, title = {NeurIPS 2018 Demographics and Inclusion Survey: Summary of Responses}, author = {Daum\'e, III, Hal and Katherine Heller}, booktitle = {NeurIPS (not a normal paper)}, year = {2018}, abstract = { We report the results of a survey conducted from August–October 2018 on demographics \& inclusion in the NeurIPS community. At analysis, 2375 people participated; the range of responses is vast. Here, we attempt to capture the key themes, with pointers to where more information can be found. Such a summary runs the risk of ignoring concerns of some members; we encourage all interested to read the full report. The below concerns are listed arbitrarily; there is no implied priority. At the NeurIPS 2018 conference, during the lunch period on Tuesday, there will be a moderated and guided townhall; one goal is to develop action items to improve the level of respect and inclusion at the conference. Thank you to all participants. }, keywords = {ml}, link = {https://github.com/hal3/neurips2018survey}, url = {http://pub.hal3.name/#daume18neuripsdi}, } @inproceedings{daume18poisoning, title = {When Does Machine Learning {FAIL}? Generalized Transferability for Evasion and Poisoning Attacks}, author = {Octavian Suciu and Radu M\u{a}rginean and Yi\u{g}itcan Kaya and Hal {Daum\'e III} and Tudor Dumitra\c{s}}, booktitle = {USENIX}, year = {2018}, abstract = { Recent results suggest that attacks against supervised machine learning systems are quite effective, while defenses are easily bypassed by new attacks. However, the specifications for machine learning systems currently lack precise adversary definitions, and the existing attacks make diverse, potentially unrealistic assumptions about the strength of the adversary who launches them. We propose the FAIL attacker model, which describes the adversary’s knowledge and control along four dimensions. 
Our model allows us to consider a wide range of weaker adversaries who have limited control and incomplete knowledge of the features, learning algorithms and training instances utilized. To evaluate the utility of the FAIL model, we consider the problem of conducting targeted poisoning attacks in a realistic setting: the crafted poison samples must have clean labels, must be individually and collectively inconspicuous, and must exhibit a generalized form of transferability, defined by the FAIL model. By taking these constraints into account, we design StingRay, a targeted poisoning attack that is practical against 4 machine learning applications, which use 3 different learning algorithms, and can bypass 2 existing defenses. Conversely, we show that a prior evasion attack is less effective under generalized transferability. Such attack evaluations, under the FAIL adversary model, may also suggest promising directions for future defenses. }, keywords = {systems ml}, url = {http://pub.hal3.name/#daume18poisoning}, } @inproceedings{daume18reslope, title = {Residual Loss Prediction: Reinforcement Learning with no Incremental Feedback}, author = {Hal {Daum\'e III} and John Langford and Amr Sharaf}, booktitle = {ICLR}, year = {2018}, link = {https://github.com/hal3/reslope}, abstract = { We consider reinforcement learning and bandit structured prediction problems with very sparse loss feedback: only at the end of an episode. We introduce a novel algorithm, RESIDUAL LOSS PREDICTION (RESLOPE), that solves such problems by automatically learning an internal representation of a denser reward function. RESLOPE operates as a reduction to contextual bandits, using its learned loss representation to solve the credit assignment problem, and a contextual bandit oracle to trade-off exploration and exploitation. RESLOPE enjoys a no-regret reduction-style theoretical guarantee and outperforms state of the art reinforcement learning algorithms in both MDP environments and bandit structured prediction settings. 
}, keywords = {ML NLP}, url = {http://pub.hal3.name/#daume18reslope}, } @inproceedings{daume18summarization, title = {Content Selection in Deep Learning Models of Summarization}, author = {Chris Kedzie and Kathleen {McKeown} and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2018}, link = {https://github.com/kedz/nnsum/tree/emnlp18-release}, abstract = { We carry out experiments with deep learning models of summarization across the domains of news, personal stories, meetings, and medical articles in order to understand how content selection is performed. We find that many sophisticated features of state of the art extractive summarizers do not improve performance over simpler models. These results suggest that it is easier to create a summarizer for a new domain than previous work suggests and bring into question the benefit of deep learning models for summarization for those domains that do have massive datasets (i.e., news). At the same time, they suggest important questions for new research in summarization; namely, new forms of sentence representations or external knowledge sources are needed that are better suited to the summarization task. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume18summarization}, } @inproceedings{daume19awesome, title = {Warm-starting Contextual Bandits: Robustly Combining Supervised and Bandit Feedback}, author = {Chicheng Zhang and Alekh Agarwal and Hal {Daum\'e III} and John Langford and Sahand N Negahban}, booktitle = {ICML}, year = {2019}, abstract = { We investigate the feasibility of learning from both fully-labeled supervised data and contextual bandit data. We specifically consider settings in which the underlying learning signal may be different between these two data sources. Theoretically, we state and prove no-regret algorithms for learning that is robust to divergences between the two sources. 
Empirically, we evaluate some of these algorithms on a large selection of datasets, showing that our approaches are feasible, and helpful in practice. }, keywords = {ml}, url = {http://pub.hal3.name/#daume19awesome}, } @inproceedings{daume19convexrl, title = {Reinforcement Learning with Convex Constraints}, author = {Sobhan Miryoosefi and Kiant\'e Brantley and Hal {Daum\'e III} and Miroslav Dud\'ik and Robert Schapire}, booktitle = {NeurIPS}, year = {2019}, abstract = { In standard reinforcement learning (RL), a learning agent seeks to optimize the overall reward. However, many key aspects of a desired behavior are more naturally expressed as constraints. For instance, the designer may want to limit the use of unsafe actions, increase the diversity of trajectories to enable exploration, or approximate expert trajectories when rewards are sparse. In this paper, we propose an algorithmic scheme that can handle a wide class of constraints in RL tasks, specifically, any constraints that require expected values of some vector measurements (such as the use of an action) to lie in a convex set. This captures previously studied constraints (such as safety and proximity to an expert), but also enables new classes of constraints (such as diversity). Our approach comes with rigorous theoretical guarantees and only relies on the ability to approximately solve standard RL tasks. As a result, it can be easily adapted to work with any model-free or model-based RL algorithm. In our experiments, we show that it matches previous algorithms that enforce safety via constraints, but can also enforce new properties that these algorithms cannot incorporate, such as diversity. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume19convexrl}, } @inproceedings{daume19fairness, title = {Improving fairness in machine learning systems: What do industry practitioners need?}, author = {Kenneth Holstein and Jennifer Wortman Vaughan and Hal {Daum\'e III} and Miroslav Dud\'ik and Hanna Wallach}, booktitle = {CHI}, year = {2019}, abstract = { The potential for machine learning (ML) systems to amplify social inequities and unfairness is receiving increasing popular and academic attention. A surge of recent work has focused on the development of algorithmic tools to assess and mitigate such unfairness. If these tools are to have a positive impact on industry practice, however, it is crucial that their design be informed by an understanding of real-world needs. Through 35 semi-structured interviews and an anonymous survey of 267 ML practitioners, we conduct the first systematic investigation of commercial product teams' challenges and needs for support in developing fairer ML systems. We identify areas of alignment and disconnect between the challenges faced by industry practitioners and solutions proposed in the fair ML research literature. Based on these findings, we highlight directions for future ML and HCI research that will better address industry practitioners' needs. }, keywords = {ml}, url = {http://pub.hal3.name/#daume19fairness}, } @inproceedings{daume19global, title = {Global Voices: Crossing Borders in Automatic News Summarization}, author = {Khanh Nguyen and Daum\'e, III, Hal}, booktitle = {EMNLP Summarization Workshop}, year = {2019}, abstract = { We construct Global Voices, a multilingual dataset for evaluating cross-lingual summarization methods. We extract social-network descriptions of Global Voices news articles to cheaply collect evaluation data for into-English and from-English summarization in 15 languages. 
Especially, for the into-English summarization task, we crowd-source a high-quality evaluation dataset based on guidelines that emphasize accuracy, coverage, and understandability. To ensure the quality of this dataset, we collect human ratings to filter out bad summaries, and conduct a survey on humans, which shows that the remaining summaries are preferred over the social-network summaries. We study the effect of translation quality in cross-lingual summarization, comparing a translate-then-summarize approach with several baselines. Our results highlight the limitations of the ROUGE metric that are overlooked in monolingual summarization. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume19global}, } @inproceedings{daume19hanna, title = {Help, Anna! Visual Navigation with Natural Multimodal Assistance via Retrospective Curiosity-Encouraging Imitation Learning}, author = {Khanh Nguyen and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2019}, abstract = { Mobile agents that can leverage help from humans can potentially accomplish more complex tasks than they could entirely on their own. We develop "Help, Anna!" (HANNA), an interactive photo-realistic simulator in which an agent fulfills object-finding tasks by requesting and interpreting natural languageand-vision assistance. An agent solving tasks in a HANNA environment can leverage simulated human assistants, called ANNA (Automatic Natural Navigation Assistants), which, upon request, provide natural language and visual instructions to direct the agent towards the goals. To address the HANNA problem, we develop a memory-augmented neural agent that hierarchically models multiple levels of decision-making, and an imitation learning algorithm that teaches the agent to avoid repeating past mistakes while simultaneously predicting its own chances of making future progress. 
Empirically, our approach is able to ask for help more effectively than competitive baselines and, thus, attains higher task success rate on both previously seen and previously unseen environments. We publicly release code and data at https://github.com/khanhptnk/hanna. }, keywords = {nlp ml}, url = {http://pub.hal3.name/#daume19hanna}, link = {https://github.com/khanhptnk/hanna}, } @inproceedings{daume19melee, title = {Meta-Learning for Contextual Bandit Exploration}, author = {Amr Sharaf and Hal {Daum\'e III}}, booktitle = {arxiv}, year = {2019}, link = {https://www.dropbox.com/sh/dc3v8po5cbu8zaw/AACu1f_4c4wIZxD1e7W0KVZ0a?dl=0}, abstract = { We describe MELEE, a meta-learning algorithm for learning a good exploration policy in the interactive contextual bandit setting. Here, an algorithm must take actions based on contexts, and learn based only on a reward signal from the action taken, thereby generating an exploration/exploitation trade-off. MELEE addresses this trade-off by learning a good exploration strategy for offline tasks based on synthetic data, on which it can simulate the contextual bandit setting. Based on these simulations, MELEE uses an imitation learning strategy to learn a good exploration policy that can then be applied to true contextual bandit tasks at test time. We compare MELEE to seven strong baseline contextual bandit algorithms on a set of three hundred real-world datasets, on which it outperforms alternatives in most settings, especially when differences in rewards are large. Finally, we demonstrate the importance of having a rich feature representation for learning how to explore. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume19melee}, } @article{daume19moocs, title = {Interpretable Engagement Models for {MOOC}s using Hinge-loss Markov Random Fields}, author = {Arti Ramesh and Dan Goldwasser and Bert Huang and Hal {Daum\'e III} and Lise Getoor}, journal = {IEEE Transactions on Learning Technologies}, year = {2019}, abstract = { Maintaining and cultivating student engagement is critical for learning. Understanding factors affecting student engagement can help in designing better courses and improving student retention. The large number of participants in massive open online courses (MOOCs) and data collected from their interactions on the MOOC open up avenues for studying student engagement at scale. In this work, we develop an interpretable statistical relational learning model for understanding student engagement in online courses using a complex combination of behavioral, linguistic, structural, and temporal cues. We show how to abstract student engagement types of active, passive, and disengagement as meaningful latent variables using logical rules in our model connecting student behavioral signals with student success in MOOCs. We demonstrate that the latent formulation for engagement helps in predicting two measures of student success: performance, their final grade in the course, and survival, their continued presence in the course till the end, across seven MOOCs. Further, in order to initiate better instructor interventions, we need to be able to predict student success early in the course. We demonstrate that we can predict student success early in the course reliably using the latent model. We also demonstrate the utility of our models in predicting student success in new courses, by training our models on one course and testing on another course. We show that the latent abstractions are helpful in predicting student success and engagement reliably in new MOOCs that haven't yet gathered student interaction data. 
We then perform a closer quantitative analysis of different features derived from student interactions on the MOOC and identify student activities that are good indicators of student success at different points in the course. Through a qualitative analysis of the latent engagement variable values, we demonstrate their utility in understanding students’ engagement levels at various points in the course and movement of students across different types of engagement. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume19moocs}, } @inproceedings{daume19nonmon, title = {Non-Monotonic Sequential Text Generation}, author = {Sean Welleck and Kiant\'e Brantley and Hal {Daum\'e III} and Kyunghyun Cho}, booktitle = {ICML}, year = {2019}, abstract = { Standard sequential generation methods assume a pre-specified generation order, such as text generation methods which generate words from left to right. In this work, we propose a framework for training models of text generation that operate in non-monotonic orders; the model directly learns good orders, without any additional annotation. Our framework operates by generating a word at an arbitrary position, and then recursively generating words to its left and then words to its right, yielding a binary tree. Learning is framed as imitation learning, including a coaching method which moves from imitating an oracle to reinforcing the policy's own preferences. Experimental results demonstrate that using the proposed method, it is possible to learn policies which generate text without pre-specifying a generation order, while achieving competitive performance with conventional left-to-right generation. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume19nonmon}, } @inproceedings{daume19readability, title = {Comparing and Developing Tools to Measure the Readability of Domain-Specific Texts}, author = {Elissa Redmiles and Lisa Maszkiewicz and Emily Hwang and Dhruv Kuchhal and Everst Liu and Miraida Morales and Denis Peskov and Sudha Rao and Rock Stevens and Kristina Gligori\'c and Sean Kross and Michelle L. Mazurek and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2019}, link = {http://github.com/SP2-MC2/Readability-Resources}, abstract = { The readability of a digital text can influence people's ability to learn new things about a range of topics from digital resources (e.g., Wikipedia, WebMD). Readability also impacts search rankings, and is used to evaluate the performance of NLP systems. Despite this, we lack a thorough understanding of how to validly measure readability at scale, especially for domain-specific texts. In this work, we present a comparison of the validity of well-known readability measures and introduce a novel approach, Smart Cloze, which is designed to address short-comings of existing measures. We compare these approaches across four different corpora: crowdworker-generated stories, Wikipedia articles, security and privacy advice, and health information. On these corpora, we evaluate the convergent and content validity of each measure, and detail tradeoffs in score precision, domain-specificity, and participant burden. These results provide a foundation for more accurate readability measurements and better evaluation of new natural-language-processing systems and tools. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume19readability}, } @inproceedings{daume20active, title = {Active Imitation Learning with Noisy Guidance}, author = {Kiant\'e Brantley and Amr Sharaf and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2020}, keywords = {nlp,ml}, url = {http://pub.hal3.name/#daume20active}, } @inproceedings{daume20alignments, title = {On the Potential of Lexico-logical Alignments for Semantic Parsing to SQL Queries}, author = {Tianze Shi and Chen Zhao and Jordan Boyd-Graber and Hal {Daum\'e III} and Lillian Lee}, booktitle = {Findings of EMNLP}, year = {2020}, abstract = { Large-scale semantic parsing datasets annotated with logical forms have enabled major advances in supervised approaches. But can richer supervision help even more? To explore the utility of fine-grained, lexical-level supervision, we introduce Squall, a dataset that enriches 11,276 WikiTableQuestions English-language questions with manually created SQL equivalents plus alignments between SQL and question fragments. Our annotation enables new training possibilities for encoder-decoder models, including approaches from machine translation previously precluded by the absence of alignments. We propose and test two methods: (1) supervised attention; (2) adopting an auxiliary objective of disambiguating references in the input queries to table columns. In 5-fold cross validation, these strategies improve over strong baselines by 4.4\% execution accuracy. Oracle experiments suggest that annotated alignments can support further accuracy gains of up to 23.9\%. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume20alignments}, code = {https://www.github.com/tzshi/squall}, } @inproceedings{daume20gicoref, title = {Toward Gender-Inclusive Coreference Resolution}, author = {Yang Trista Cao and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2020}, keywords = {nlp,fate}, url = {http://pub.hal3.name/#daume20gicoref}, abstract = { Correctly resolving textual mentions of people fundamentally entails making inferences about those people. Such inferences raise the risk of systemic biases in coreference resolution systems, including biases that can harm binary and non-binary trans and cis stakeholders. To better understand such biases, we foreground nuanced conceptualizations of gender from sociology and sociolinguistics, and develop two new datasets for interrogating bias in crowd annotations and in existing coreference resolution systems. Through these studies, conducted on English text, we confirm that without acknowledging and building systems that recognize the complexity of gender, we build systems that lead to many potential harms. }, } @inproceedings{daume20minimization, title = {Operationalizing the Legal Principle of Data Minimization for Personalization}, author = {Asia J. Biega and Peter Potash and Daum\'e, III, Hal and Fernando Diaz and Mich\`ele Finck}, booktitle = {Proceedings of the Conference on Research and Developments in Information Retrieval (SIGIR)}, year = {2020}, keywords = {ml,fate}, url = {http://pub.hal3.name/#daume20minimization}, abstract = { Article 5(1)(c) of the European Union’s General Data Protection Regulation (GDPR) requires that ``personal data shall be [...] adequate, relevant, and limited to what is necessary in relation to the purposes for which they are processed (`data minimisation')''. To date, the legal and computational definitions of `purpose limitation' and `data minimization' remain largely unclear. 
In particular, the interpretation of these principles is an open issue for information access systems that optimize for user experience through personalization and do not strictly require personal data collection for the delivery of basic service. In this paper, we identify a lack of a homogeneous interpretation of the data minimization principle and explore two operational definitions applicable in the context of personalization. The focus of our empirical study in the domain of recommender systems is on providing foundational insights about the (i) feasibility of different data minimization definitions, (ii) robustness of different recommendation algorithms to minimization, and (iii) performance of different minimization strategies. We find that the performance decrease incurred by data minimization might not be substantial, but that it might disparately impact different users--a finding which has implications for the viability of different formal minimization definitions. Overall, our analysis uncovers the complexities of the data minimization problem in the context of personalization and maps the remaining computational and regulatory challenges. }, } @inproceedings{daume20nmtadapt, title = {Meta-learning for Few-Shot NMT Adaptation}, author = {Amr Sharaf and Hany Hassan and Daum\'e, III, Hal}, booktitle = {WNGT@ACL}, year = {2020}, abstract = { We present META-MT, a meta-learning approach to adapt Neural Machine Translation (NMT) systems in a few-shot setting. META-MT provides a new approach to make NMT models easily adaptable to many target domains with the minimal amount of in-domain data. We frame the adaptation of NMT systems as a meta-learning problem, where we learn to adapt to new unseen domains based on simulated offline meta-training domain adaptation tasks. We evaluate the proposed meta-learning strategy on ten domains with general large scale NMT systems. 
We show that META-MT significantly outperforms classical domain adaptation when very few in-domain examples are available. Our experiments show that META-MT can outperform classical fine-tuning by up to 2.5 BLEU points after seeing only 4,000 translated words (300 parallel sentences). }, keywords = {nlp}, url = {http://pub.hal3.name/#daume20nmtadapt}, award = {Best Paper Award}, } @inproceedings{daume20power, title = {Language (technology) is Power: A Critical Survey of ``Bias'' in NLP}, author = {Su Lin Blodgett and Solon Barocas and Daum\'e, III, Hal and Hanna Wallach}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2020}, keywords = {nlp,fate}, url = {http://pub.hal3.name/#daume20power}, abstract = { We survey 146 papers analyzing ``bias'' in NLP systems, finding that their motivations are often vague, inconsistent, and lacking in normative reasoning, despite the fact that analyzing ``bias'' is an inherently normative process. We further find that these papers' proposed quantitative techniques for measuring or mitigating ``bias'' are poorly matched to their motivations and do not engage with the relevant literature outside of NLP. Based on these findings, we describe the beginnings of a path forward by proposing three recommendations that should guide work analyzing ``bias'' in NLP systems. These recommendations rest on a greater recognition of the relationships between language and social hierarchies, encouraging researchers and practitioners to articulate their conceptualizations of ``bias''--i.e., what kinds of system behaviors are harmful, in what ways, to whom, and why, as well as the normative reasoning underlying these statements--and to center work around the lived experiences of members of communities affected by NLP systems, while interrogating and reimagining the power relations between technologists and such communities. 
}, } @inproceedings{daume21beamdr, title = {Multi-Step Reasoning Over Unstructured Text with Beam Dense Retrieval}, author = {Chen Zhao and Chenyan Xiong and Jordan Boyd-Graber and Daum\'e, III, Hal}, booktitle = {NAACL (short)}, year = {2021}, abstract = { Complex question answering often requires finding a reasoning chain that consists of multiple evidence pieces. Current approaches incorporate the strengths of structured knowledge and unstructured text, assuming text corpora is semi-structured. Building on dense retrieval methods, we propose a new multi-step retrieval approach (BeamDR) that iteratively forms an evidence chain through beam search in dense representations. When evaluated on multi-hop question answering, BeamDR is competitive to state-of-the-art systems, without using any semi-structured information. Through query composition in dense space, BeamDR captures the implicit relationships between evidence in the reasoning chain. The code is available at https://github.com/henryzhao5852/BeamDR }, keywords = {nlp}, url = {http://pub.hal3.name/#daume21beamdr}, } @inproceedings{daume21covid, title = {Responsible Computing During COVID-19 and Beyond}, author = {Solon Barocas and Asia J. Biega and Margarita Boyarskaya and Kate Crawford and Hal {Daum\'e III} and Miroslav Dud\'ik and Benjamin Fish and Mary L. Gray and Brent Hecht and Alexandra Olteanu and Forough Poursabzi-Sangdeh and Luke Stark and Jennifer Wortman Vaughan and Hanna Wallach and Marion Zepf}, booktitle = {CACM}, year = {2021}, abstract = { The COVID-19 pandemic has both created and exacerbated a series of cascading and interrelated crises whose impacts continue to reverberate. From the immediate effects on people's health to the pressures on healthcare systems and mass unemployment, millions of people are suffering. 
For many of us who work in the digital technology industry, our first impulse may be to devise technological solutions to what we perceive as the most urgent problems when faced by crises such as these. Although the desire to put our expertise to good use is laudable, technological solutions that fail to consider broader social, political, and economic contexts can have unintended consequences, undermining their efficacy and even harming the very communities that they are intended to help. To ensure our contributions achieve their intended results without causing inadvertent harm, we must think carefully about which projects we work on, how we should go about working on them, and with whom such work should be done. In this column, we offer a series of guidelines for navigating these choices. As current and former members of the Fairness, Accountability, Transparency, and Ethics (FATE) group at Microsoft Research, we have been working actively on the ethical and societal impacts of technologies such as artificial intelligence since 2016. While we originally developed these guidelines to help our colleagues at Microsoft respond to the first wave of the pandemic in the spring of 2020, we believe they are general enough that their value extends beyond Microsoft and beyond projects focused on the COVID-19 pandemic. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume21covid}, } @inproceedings{daume21distqa, title = {Distantly-Supervised Evidence Retrieval Enables Question Answering without Annotated Evidence Pieces}, author = {Chen Zhao and Chenyan Xiong and Jordan Boyd-Graber and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2021}, abstract = { }, keywords = {nlp}, url = {http://pub.hal3.name/#daume21distqa}, } @inproceedings{daume21flourishing, title = {Supporting human flourishing by ensuring human involvement in AI-infused systems}, author = {Joel Chan and Daum\'e, III, Hal and John P. Dickerson and Hernisa Kacorri and Ben Shneiderman}, booktitle = {HCAI Workshop at NeurIPS 2021}, year = {2021}, abstract = { Researchers, developers, business leaders, policy makers and others are expanding the technology-centered scope of Artificial Intelligence (AI) to include HumanCentered AI (HCAI) ways of thinking. This expansion from an algorithm-focused view to embrace a human-centered perspective, can shape the future of technology so as to better serve human needs. Educators, designers, software engineers, product managers, evaluators, and government agency staffers can build on AIinfused technologies to design products and services that make life better for the users. By switching the scope from technology-centered to human-centered, we can build AI-infused tools that enable people to better care for each other, build sustainable communities, and restore the environment. }, keywords = {hci ml}, url = {http://pub.hal3.name/#daume21flourishing}, } @inproceedings{daume21novice, title = {A Novice-Reviewer Experiment to Address Scarcity of Qualified Reviewers in Large Conferences}, author = {Ivan Stelmakh and Nihar B. 
Shah and Aarti Singh and Daum\'e, III, Hal}, booktitle = {AAAI}, year = {2021}, abstract = { Conference peer review constitutes a human-computation process whose importance cannot be overstated: not only it identifies the best submissions for acceptance, but, ultimately, it impacts the future of the whole research area by promoting some ideas and restraining others. A surge in the number of submissions received by leading AI conferences has challenged the sustainability of the review process by increasing the burden on the pool of qualified reviewers which is growing at a much slower rate. In this work, we consider the problem of reviewer recruiting with a focus on the scarcity of qualified reviewers in large conferences. Specifically, we design a procedure for (i) recruiting reviewers from the population not typically covered by major conferences and (ii) guiding them through the reviewing pipeline. In conjunction with ICML 2020 — a large, top-tier machine learning conference — we recruit a small set of reviewers through our procedure and compare their performance with the general population of ICML reviewers. Our experiment reveals that a combination of the recruiting and guiding mechanisms allows for a principled enhancement of the reviewer pool and results in reviews of superior quality compared to the conventional pool of reviews as evaluated by senior members of the program committee (meta-reviewers). }, keywords = {ml}, url = {http://pub.hal3.name/#daume21novice}, } @inproceedings{daume21resubmit, title = {Prior and Prejudice: The Novice Reviewers' Bias against Resubmissions in Conference Peer Review}, author = {Ivan Stelmakh and Nihar B. Shah and Aarti Singh and Daum\'e, III, Hal}, booktitle = {CSCW}, year = {2021}, abstract = { Modern machine learning and computer science conferences are experiencing a surge in the number of submissions that challenges the quality of peer review as the number of competent reviewers is growing at a much slower rate. 
To curb this trend and reduce the burden on reviewers, several conferences have started encouraging or even requiring authors to declare the previous submission history of their papers. Such initiatives have been met with skepticism among authors, who raise the concern about a potential bias in reviewers’ recommendations induced by this information. In this work, we investigate whether reviewers exhibit a bias caused by the knowledge that the submission under review was previously rejected at a similar venue, focusing on a population of novice reviewers who constitute a large fraction of the reviewer pool in leading machine learning and computer science conferences. We design and conduct a randomized controlled trial closely replicating the relevant components of the peer-review pipeline with 133 reviewers (master’s, junior PhD students, and recent graduates of top US universities) writing reviews for 19 papers. The analysis reveals that reviewers indeed become negatively biased when they receive a signal about paper being a resubmission, giving almost 1 point lower overall score on a 10-point Likert item (∆ = −0.78, 95\% CI = [−1.30, −0.24]) than reviewers who do not receive such a signal. Looking at specific criteria scores (originality, quality, clarity and significance), we observe that novice reviewers tend to underrate quality the most. }, keywords = {ml}, url = {http://pub.hal3.name/#daume21resubmit}, } @inproceedings{daume21stereotypes, title = {Analyzing Stereotypes in Generative Text Inference Tasks}, author = {Anna Sotnikova and Yang Trista Cao and Hal {Daum\'e III} and Rachel Rudinger}, booktitle = {ACL (findings)}, year = {2021}, abstract = { Stereotypes are inferences drawn about people based on their demographic attributes, which may result in harms to users when a system is deployed. 
In generative language-inference tasks, given a premise, a model produces plausible hypotheses that follow either logically (natural language inference) or commonsensically (commonsense inference). Such tasks are therefore a fruitful setting in which to explore the degree to which NLP systems encode stereotypes. In our work, we study how stereotypes manifest when the potential targets of stereotypes are situated in real-life, neutral contexts. We collect human judgments on the presence of stereotypes in generated inferences, and compare how perceptions of stereotypes vary due to annotator positionality. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume21stereotypes}, } @inproceedings{daume21woe, title = {From Human Explanation to Model Interpretability: A Framework Based on Weight of Evidence}, author = {David Alvarez-Melis and Harmanpreet Kaur and Hal {Daum\'e III} and Hanna Wallach and Jennifer Wortman Vaughan}, booktitle = {HCOMP}, year = {2021}, abstract = { We take inspiration from the study of human explanation to inform the design and evaluation of interpretability methods in machine learning. First, we survey the literature on human explanation in philosophy, cognitive science, and the social sciences, and propose a list of design principles for machine-generated explanations that are meaningful to humans. Using the concept of weight of evidence from information theory, we develop a method for generating explanations that adhere to these principles. We show that this method can be adapted to handle high-dimensional, multi-class settings, yielding a flexible framework for generating explanations. We demonstrate that these explanations can be estimated accurately from finite samples and are robust to small perturbations of the inputs. 
We also evaluate our method through a qualitative user study with machine learning practitioners, where we observe that the resulting explanations are usable despite some participants struggling with background concepts like prior class probabilities. Finally, we conclude by surfacing design implications for interpretability tools in general. }, keywords = {ml,hci}, url = {http://pub.hal3.name/#daume21woe}, } @inproceedings{daume22ask, title = {Learning When and What to Ask: a Hierarchical Reinforcement Learning Framework}, author = {Khanh Nguyen and Yonatan Bisk and Daum\'e, III, Hal}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2022}, abstract = { Reliable AI agents should be mindful of the limits of their knowledge and consult humans when sensing that they do not have sufficient knowledge to make sound decisions. We formulate a hierarchical reinforcement learning framework for learning to decide when to request additional information from humans and what type of information would be helpful to request. Our framework extends partially-observed Markov decision processes (POMDPs) by allowing an agent to interact with an assistant to leverage their knowledge in accomplishing tasks. Results on a simulated human-assisted navigation problem demonstrate the effectiveness of our framework: aided with an interaction policy learned by our method, a navigation policy achieves up to a 7x improvement in task success rate compared to performing tasks only by itself. The interaction policy is also efficient: on average, only a quarter of all actions taken during a task execution are requests for information. We analyze benefits and challenges of learning with a hierarchical policy structure and suggest directions for future work. 
}, keywords = {ml}, url = {http://pub.hal3.name/#daume22ask}, } @article{daume22coref, title = {Toward Gender-Inclusive Coreference Resolution: An Analysis of Gender and Bias Throughout the Machine Learning Lifecycle}, author = {Yang Trista Cao and Daum\'e, III, Hal}, journal = {Computational Linguistics}, year = {2022}, abstract = { Correctly resolving textual mentions of people fundamentally entails making inferences about those people. Such inferences raise the risk of systematic biases in coreference resolution systems, including biases that can harm binary and non-binary trans and cis stakeholders. To better understand such biases, we foreground nuanced conceptualizations of gender from sociology and sociolinguistics, and investigate where in the machine learning pipeline such biases can enter a coreference resolution system. We inspect many existing data sets for trans-exclusionary biases, and develop two new data sets for interrogating bias in both crowd annotations and in existing coreference resolution systems. Through these studies, conducted on English text, we confirm that without acknowledging and building systems that recognize the complexity of gender, we will build systems that fail for: quality of service … }, keywords = {nlp}, url = {http://pub.hal3.name/#daume22coref}, } @inproceedings{daume22hcxai, title = {Human-Centered Explainable AI (HCXAI): beyond opening the black-box of AI}, author = {Upol Ehsan and Philipp Wintersberger and Q Vera Liao and Elizabeth Anne Watkins and Carina Manger and Daum\'e, III, Hal and Andreas Riener and Mark O Riedl}, booktitle = {CHI}, year = {2022}, abstract = { Explainability of AI systems is crucial to hold them accountable because they are increasingly becoming consequential in our lives by powering high-stakes decisions in domains like healthcare and law. When it comes to Explainable AI (XAI), understanding who interacts with the black-box of AI is just as important as “opening” it, if not more. 
Yet the discourse of XAI has been predominantly centered around the black-box, suffering from deficiencies in meeting user needs and exacerbating issues of algorithmic opacity. To address these issues, researchers have called for human-centered approaches to XAI. In this second CHI workshop on Human-centered XAI (HCXAI), we build on the success of the first installment from CHI 2021 to expand the conversation around XAI. We chart the domain and shape the HCXAI discourse with reflective discussions from diverse stakeholders. The goal of the second installment is to … }, keywords = {hci}, url = {http://pub.hal3.name/#daume22hcxai}, } @article{daume22hstm, title = {Heterogeneous Supervised Topic Models}, author = {Dhanya Sridhar and Daum\'e, III, Hal and David Blei}, journal = {TACL}, year = {2022}, abstract = { Researchers in the social sciences are often interested in the relationship between text and an outcome of interest, where the goal is to both uncover latent patterns in the text and predict outcomes for unseen texts. To this end, this paper develops the heterogeneous supervised topic models (HSTM), a probabilistic approach to text analysis and prediction. HSTMs posit a joint model of text and outcomes to find heterogeneous patterns that help with both text analysis and prediction. The main benefit of HSTMs is that they capture heterogeneity in the relationship between text and the outcome across latent topics. To fit HSTMs, we develop a variational inference algorithm based on the auto-encoding variational Bayes framework. We study the performance of HSTMs on eight datasets and find that they consistently outperform related methods, including fine-tuned black box models. Finally, we apply HSTMs to analyze news articles labeled with pro- or anti-tone. We find evidence of differing language used to signal a pro- and anti-tone. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume22hstm}, } @inproceedings{daume22nlg, title = {Deconstructing NLG Evaluation: Evaluation Practices, Assumptions, and Their Implications}, author = {Kaitlyn Zhou and Su Lin Blodgett and Adam Trischler and Daum\'e, III, Hal and Kaheer Suleman and Alexandra Olteanu}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2022}, abstract = { There are many ways to express similar things in text, which makes evaluating natural language generation (NLG) systems difficult. Compounding this difficulty is the need to assess varying quality criteria depending on the deployment setting. While the landscape of NLG evaluation has been well-mapped, practitioners' goals, assumptions, and constraints -- which inform decisions about what, when, and how to evaluate -- are often partially or implicitly stated, or not stated at all. Combining a formative semi-structured interview study of NLG practitioners (N=18) with a survey study of a broader sample of practitioners (N=61), we surface goals, community practices, assumptions, and constraints that shape NLG evaluations, examining their implications and how they embody ethical considerations. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume22nlg}, } @inproceedings{daume22panda, title = {Promoting Fairness in Learned Models by Learning to Active Learn under Parity Constraints}, author = {Amr Sharaf and Daum\'e, III, Hal}, booktitle = {FAccT}, year = {2022}, abstract = { Machine learning models can have consequential effects when used to automate decisions, and disparities between groups of people in the error rates of those decisions can lead to harms suffered more by some groups than others. Past algorithmic approaches aim to enforce parity across groups given a fixed set of training data; instead, we ask: what if we can gather more data to mitigate disparities? 
We develop a meta-learning algorithm for parity-constrained active learning that learns a policy to decide which labels to query so as to maximize accuracy subject to parity constraints. To optimize the active learning policy, our proposed algorithm formulates the parity-constrained active learning task as a bi-level optimization problem. The inner level corresponds to training a classifier on a subset of labeled examples. The outer level corresponds to updating the selection policy choosing this subset to achieve a desired fairness and accuracy behavior on the trained classifier. To solve this constrained bi-level optimization problem, we employ the Forward-Backward Splitting optimization method. Empirically, across several parity metrics and classification tasks, our approach outperforms alternatives by a large margin. }, keywords = {ml}, url = {http://pub.hal3.name/#daume22panda}, } @inproceedings{daume22request, title = {A framework for learning to request rich and contextually useful information from humans}, author = {Khanh Nguyen and Yonatan Bisk and Daum\'e, III, Hal}, booktitle = {Proceedings of the International Conference on Machine Learning (ICML)}, year = {2022}, abstract = { When deployed, AI agents will encounter problems that are beyond their autonomous problem-solving capabilities. Leveraging human assistance can help agents overcome their inherent limitations and robustly cope with unfamiliar situations. We present a general interactive framework that enables an agent to request and interpret rich, contextually useful information from an assistant that has knowledge about the task and the environment. We demonstrate the practicality of our framework on a simulated human-assisted navigation problem. Aided with an assistance-requesting policy learned by our method, a navigation agent achieves up to a 7× improvement in success rate on tasks that take place in previously unseen environments, compared to fully autonomous behavior. 
We show that the agent can take advantage of different types of information depending on the context, and analyze the benefits and challenges of learning the assistance-requesting policy when the assistant can recursively decompose tasks into subtasks. }, keywords = {ml}, url = {http://pub.hal3.name/#daume22request}, } @inproceedings{daume22spoken, title = {Spoken language interaction with robots: Recommendations for future research}, author = {Matthew Marge and Carol Espy-Wilson and Nigel G Ward and Abeer Alwan and Yoav Artzi and Mohit Bansal and Gil Blankenship and Joyce Chai and Daum\'e, III, Hal and Debadeepta Dey and Mary Harper and Thomas Howard and Casey Kennington and Ivana Kruijff-Korbayová and Dinesh Manocha and Cynthia Matuszek and Ross Mead and Raymond Mooney and Roger K Moore and Mari Ostendorf and Heather Pon-Barry and Alexander I Rudnicky and Matthias Scheutz and Robert St Amant and Tong Sun and Stefanie Tellex and David Traum and Zhou Yu}, booktitle = {Computer Speech and Language}, year = {2022}, abstract = { With robotics rapidly advancing, more effective human–robot interaction is increasingly needed to realize the full potential of robots for society. While spoken language must be part of the solution, our ability to provide spoken language interaction capabilities is still very limited. In this article, based on the report of an interdisciplinary workshop convened by the National Science Foundation, we identify key scientific and engineering advances needed to enable effective spoken language interaction with robotics. 
We make 25 recommendations, involving eight general themes: putting human needs first, better modeling the social and interactive aspects of language, improving robustness, creating new methods for rapid adaptation, better integrating speech and language with other communication modalities, giving speech and language components access to rich representations of the robot’s current knowledge and state, making all components operate in real time, and improving research infrastructure and resources. Research and development that prioritizes these topics will, we believe, provide a solid foundation for the creation of speech-capable robots that are easy and effective for humans to work with. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume22spoken}, } @inproceedings{daume22stereotypes, title = {Theory-Grounded Measurement of U.S. Social Stereotypes in English Language Models}, author = {Yang Trista Cao and Anna Sotnikova and Daum\'e, III, Hal and Rachel Rudinger and Linda Zou}, booktitle = {Proceedings of the Conference of the North American Chapter of the Association for Computational Linguistics (NAACL)}, year = {2022}, abstract = { NLP models trained on text have been shown to reproduce human stereotypes, which can magnify harms to marginalized groups when systems are deployed at scale. We adapt the Agency-Belief-Communion (ABC) stereotype model of Koch et al. (2016) from social psychology as a framework for the systematic study and discovery of stereotypic group-trait associations in language models (LMs). We introduce the sensitivity test (SeT) for measuring stereotypical associations from language models. To evaluate SeT and other measures using the ABC model, we collect group-trait judgments from U.S.-based subjects to compare with English LM stereotypes. Finally, we extend this framework to measure LM stereotyping of intersectional identities. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume22stereotypes}, } @inproceedings{daume22vqa, title = {What's Different between Visual Question Answering for Machine "Understanding" Versus for Accessibility?}, author = {Yang Trista Cao and Kyle Seelman and Kyungjun Lee and Daum\'e, III, Hal}, booktitle = {AACL-IJCNLP}, year = {2022}, abstract = { In visual question answering (VQA), a machine must answer a question given an associated image. Recently, accessibility researchers have explored whether VQA can be deployed in a real-world setting where users with visual impairments learn about their environment by capturing their visual surroundings and asking questions. However, most of the existing benchmarking datasets for VQA focus on machine “understanding” and it remains unclear how progress on those datasets corresponds to improvements in this real-world use case. We aim to answer this question by evaluating discrepancies between machine “understanding” datasets (VQA-v2) and accessibility datasets (VizWiz) by evaluating a variety of VQA models. Based on our findings, we discuss opportunities and challenges in VQA for accessibility and suggest directions for future work. }, keywords = {nlp}, award = {Best Theme Paper}, url = {http://pub.hal3.name/#daume22vqa}, } @inproceedings{daume23aslcitizen, title = {ASL Citizen: A Community-Sourced Dataset for Advancing Isolated Sign Language Recognition}, author = {Aashaka Desai and Lauren Berger and Fyodor O. Minakov and Vanessa Milan and Chinmay Singh and Kriston Pumphrey and Richard E. Ladner and Daum\'e, III, Hal and Alex X. Lu and Naomi Caselli and Danielle Bragg}, booktitle = {NeurIPS (Data \& Benchmarks track)}, year = {2023}, abstract = { Sign languages are used as a primary language by approximately 70 million D/deaf people world-wide. However, most communication technologies operate in spoken and written languages, creating inequities in access. 
To help tackle this problem, we release ASL Citizen, the largest Isolated Sign Language Recognition (ISLR) dataset to date, collected with consent and containing 83,912 videos for 2,731 distinct signs filmed by 52 signers in a variety of environments. We propose that this dataset be used for sign language dictionary retrieval for American Sign Language (ASL), where a user demonstrates a sign to their own webcam with the aim of retrieving matching signs from a dictionary. We show that training supervised machine learning classifiers with our dataset greatly advances the state-of-the-art on metrics relevant for dictionary retrieval, achieving, for instance, 62\% accuracy and a recall-at-10 of 90\%, evaluated entirely on videos of users who are not present in the training or validation sets. }, keywords = {accessibility, nlp}, url = {http://pub.hal3.name/#daume23aslcitizen}, } @inproceedings{daume23background, title = {What Else Do I Need to Know? The Effect of Background Information on Users' Reliance on QA Systems}, author = {Navita Goyal and Eleftheria Briakou and Amanda Liu and Connor Baumler and Claire Bonial and Jeffrey Micher and Clare R. Voss and Marine Carpuat and Daum\'e, III, Hal}, booktitle = {EMNLP}, year = {2023}, abstract = { NLP systems have shown impressive performance at answering questions by retrieving relevant context. However, with the increasingly large models, it is impossible and often undesirable to constrain models' knowledge or reasoning to only the retrieved context. This leads to a mismatch between the information that the models access to derive the answer and the information that is available to the user to assess the model predicted answer. In this work, we study how users interact with QA systems in the absence of sufficient information to assess their predictions. Further, we ask whether adding the requisite background helps mitigate users' over-reliance on predictions. 
Our study reveals that users rely on model predictions even in the absence of sufficient information needed to assess the model's correctness. Providing the relevant background, however, helps users better catch model errors, reducing over-reliance on incorrect predictions. On the flip side, background information also increases users' confidence in their accurate as well as inaccurate judgments. Our work highlights that supporting users' verification of QA predictions is an important, yet challenging, problem. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23background}, } @inproceedings{daume23ceil, title = {Progressively Efficient Learning}, author = {Ruijie Zheng and Khanh Nguyen and Daum\'e, III, Hal and Furong Huang and Karthik Narasimhan}, booktitle = {Preprint}, year = {2023}, abstract = { Assistant AI agents should be capable of rapidly acquiring novel skills and adapting to new user preferences. Traditional frameworks like imitation learning and reinforcement learning do not facilitate this capability because they support only low-level, inefficient forms of communication. In contrast, humans communicate with progressive efficiency by defining and sharing abstract intentions. Reproducing similar capability in AI agents, we develop a novel learning framework named Communication-Efficient Interactive Learning (CEIL). By equipping a learning agent with an abstract, dynamic language and an intrinsic motivation to learn with minimal communication effort, CEIL leads to emergence of a human-like pattern where the learner and the teacher communicate progressively efficiently by exchanging increasingly more abstract intentions. CEIL demonstrates impressive performance and communication efficiency on a 2D MineCraft domain featuring long-horizon decision-making tasks. 
Agents trained with CEIL quickly master new tasks, outperforming non-hierarchical and hierarchical imitation learning by up to 50\% and 20\% in absolute success rate, respectively, given the same number of interactions with the teacher.
Our work advocates for having a principled procedure for aligning language models with humans that involves (i) formulating task-oriented capabilities, (ii) devising a method to quantify their deficiency, and (iii) iteratively improving them. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23cognitive}, } @inproceedings{daume23conceptualizations, title = {It Takes Two to Tango: Navigating Conceptualizations of NLP Tasks and Measurements of Performance}, author = {Arjun Subramonian and Xingdi Yuan and Daum\'e, III, Hal and Su Lin Blodgett}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2023}, abstract = { Progress in NLP is increasingly measured through benchmarks; hence, contextualizing progress requires understanding when and why practitioners may disagree about the validity of benchmarks. We develop a taxonomy of disagreement, drawing on tools from measurement modeling, and distinguish between two types of disagreement: 1) how tasks are conceptualized and 2) how measurements of model performance are operationalized. To provide evidence for our taxonomy, we conduct a meta-analysis of relevant literature to understand how NLP tasks are conceptualized, as well as a survey of practitioners about their impressions of different factors that affect benchmark validity. Our meta-analysis and survey across eight tasks, ranging from coreference resolution to question answering, uncover that tasks are generally not clearly and consistently conceptualized and benchmarks suffer from operationalization disagreements. These findings support our proposed taxonomy of disagreement. Finally, based on our taxonomy, we present a framework for constructing benchmarks and documenting their limitations. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23conceptualizations}, } @inproceedings{daume23daal, title = {Which Examples Should be Multiply Annotated? 
Active Learning When Annotators May Disagree}, author = {Connor Baumler and Anna Sotnikova and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2023}, abstract = { Linguistic annotations, especially for controver- sial topics like hate speech detection, are fre- quently contested due to annotator backgrounds and positionalities. In such situations, pre- serving this disagreement through the machine learning pipeline can be important for down- stream use cases. However, capturing disagree- ment can increase annotation time and expense. Fortunately, for many tasks, not all examples are equally controversial; we develop an ac- tive learning approach, Disagreement Aware Active Learning (DAAL) that concentrates an- notations on examples where model entropy and annotator entropy are the most different. Because we cannot know the true entropy of an- notations on unlabeled examples, we estimate a model that predicts annotator entropy trained using very few multiply-labeled examples. We find that traditional uncertainty-based active learning underperforms simple passive learn- ing on tasks with high levels of disagreement, but that our active learning approach is able to successfully improve on passive learning, re- ducing the number of annotations required by at least 24\% on average across several datasets. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23daal}, } @inproceedings{daume23drm, title = {DrM: Mastering Visual Reinforcement Learning through Dormant Ratio Minimization}, author = {Guowei Xu and Ruijie Zheng and Yongyuan Liang and Xiyao Wang and Zhecheng Yuan and Tianying Ji and Yu Luo and Xiaoyu Liu and Jiaxin Yuan and Pu Hua and Shuzhen Li and Yanjie Ze and Daum\'e, III, Hal and Furong Huang and Huazhe Xu}, booktitle = {Preprint}, year = {2023}, abstract = { Visual reinforcement learning (RL) has shown promise in continuous control tasks. 
Despite its progress, current algorithms are still unsatisfactory in virtually every aspect of the performance such as sample efficiency, asymptotic performance, and their robustness to the choice of random seeds. In this paper, we identify a major shortcoming in existing visual RL methods that is the agents often exhibit sustained inactivity during early training, thereby limiting their ability to explore effectively. Expanding upon this crucial observation, we additionally unveil a significant correlation between the agents’ inclination towards motorically inactive exploration and the absence of neuronal activity within their policy networks. To quantify this inactivity, we adopt dormant ratio (Sokar et al., 2023) as a metric to measure inactivity in the RL agent’s network. Empirically, we also recognize that the dormant ratio can act as a standalone indicator of an agent’s activity level, regardless of the received reward signals. Leveraging the aforementioned insights, we introduce DrM , a method that uses three core mechanisms to guide agents’ exploration-exploitation trade-offs by actively minimizing the dormant ratio. Experiments demonstrate that DrM achieves significant improvements in sample efficiency and asymptotic performance with no broken seeds (76 seeds in total) across three continuous control benchmark environments, including DeepMind Control Suite, MetaWorld, and Adroit. Most importantly, DrM is the first model-free algorithm that consistently solves tasks in both the Dog and Manipulator domains from the DeepMind Control Suite as well as three dexterous hand manipulation tasks without demonstrations in Adroit, all based on pixel observations. }, keywords = {ml}, url = {http://pub.hal3.name/#daume23drm}, } @inproceedings{daume23factual, title = {Factual or Contextual? 
Disentangling Error Types in Entity Description Generation}, author = {Navita Goyal and Ani Nenkova and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2023}, abstract = { In the task of entity description generation, given a context and a specified entity, a model must describe that entity correctly and in a contextually-relevant way. In this task, as well as broader language generation tasks, the generation of a nonfactual description (factual error) versus an incongruous description (contextual error) is fundamentally different, yet often conflated. We develop an evaluation paradigm that enables us to disentangle these two types of errors in naturally occurring textual contexts. We find that factuality and congruity are often at odds, and that models specifically struggle with accurate descriptions of entities that are less familiar to people. This shortcoming of language models raises concerns around the trustworthiness of such models, since factual errors on less well-known entities are exactly those that a human reader will not recognize. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23factual}, } @inproceedings{daume23fairexp, title = {Towards Conceptualization of "Fair Explanation": Disparate Impacts of anti-Asian Hate Speech Explanations on Content Moderators}, author = {Tin Nguyen and Jiannan Xu and Aayushi Roy and Daum\'e, III, Hal and Marine Carpuat}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2023}, abstract = { Recent research at the intersection of AI explainability and fairness has focused on how explanations can improve human-plus-AI task performance as assessed by fairness measures. We propose to characterize what constitutes an explanation that is itself "fair" -- an explanation that does not adversely impact specific populations. 
We formulate a novel evaluation method of "fair explanations" using not just accuracy and label time, but also psychological impact of explanations on different user groups across many metrics (mental discomfort, stereotype activation, and perceived workload). We apply this method in the context of content moderation of potential hate speech, and its differential impact on Asian vs. non-Asian proxy moderators, across explanation approaches (saliency map and counterfactual explanation). We find that saliency maps generally perform better and show less evidence of disparate impact (group) and individual unfairness than counterfactual explanations. }, keywords = {ex}, url = {http://pub.hal3.name/#daume23fairexp}, } @inproceedings{daume23fairprism, title = {FairPrism: Evaluating Fairness-Related Harms in Text Generation}, author = {Eve Fleisig and Aubrie Amstutz and Chad Atalla and Su Lin Blodgett and Daum\'e, III, Hal and Alexandra Olteanu and Emily Sheng and Dan Vann and Hanna Wallach}, booktitle = {Proceedings of the Conference of the Association for Computational Linguistics (ACL)}, year = {2023}, abstract = { It is critical to measure and mitigate fairness- related harms caused by AI text generation systems, including stereotyping and demeaning harms. To that end, we introduce FairPrism, a dataset of 5,000 examples of AI-generated English text with detailed human annotations covering a diverse set of harms relating to gender and sexuality. FairPrism aims to address several limitations of existing datasets for measuring and mitigating fairness-related harms, including improved transparency, clearer specification of dataset coverage, and accounting for annotator disagreement and harms that are context-dependent. FairPrism’s annotations include the extent of stereotyping and demeaning harms, the demographic groups targeted, and appropriateness for different applications. 
The annotations also include specific harms that occur in interactive contexts and harms that raise normative concerns when the “speaker” is an AI system. Due to its precision and granularity, FairPrism can be used to diagnose (1) the types of fairness- related harms that AI text generation systems cause, and (2) the potential limitations of mitigation methods, both of which we illustrate through case studies. Finally, the process we followed to develop FairPrism offers a recipe for building improved datasets for measuring and mitigating harms caused by AI systems. }, keywords = {nlp}, url = {http://pub.hal3.name/#daume23fairprism}, } @inproceedings{daume23hallucination, title = {Hallucination Detection for Grounded Instruction Generation}, author = {Lingjun Zhao and Khanh Nguyen and Daum\'e, III, Hal}, booktitle = {EMNLP (Findings)}, year = {2023}, abstract = { We investigate the problem of generating instructions to guide humans to navigate in simulated residential environments. A major issue with current models is hallucination: they generate references to actions or objects that are inconsistent with what a human follower would perform or encounter along the described path. We develop a model that detects these hallucinated references by adopting a model pre-trained on a large corpus of image-text pairs, and fine-tuning it with a contrastive loss that separates correct instructions from instructions containing synthesized hallucinations. Our final model outperforms several baselines, including using word probability estimated by the instruction-generation model, and supervised models based on LSTM and Transformer. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume23hallucination}, } @inproceedings{daume23impact, title = {Evaluating the Social Impact of Generative AI Systems in Systems and Society}, author = {Irene Solaiman and Zeerak Talat and William Agnew and Lama Ahmad and Dylan Baker and Su Lin Blodgett and Daum\'e, III, Hal and Jesse Dodge and Ellie Evans and Sara Hooker and Yacine Jernite and Alexandra Sasha Luccioni and Alberto Lusoli and Margaret Mitchell and Jessica Newman and Marie-Therese Png and Andrew Strait and Aposotol Vassilev}, booktitle = {Preprint}, year = {2023}, abstract = { Generative AI systems across modalities, ranging from text, image, audio, and video, have broad social impacts, but there exists no official standard for means of evaluating those impacts and which impacts should be evaluated. We move toward a standard approach in evaluating a generative AI system for any modality, in two overarching categories: what is able to be evaluated in a base system that has no predetermined application and what is able to be evaluated in society. We describe specific social impact categories and how to approach and conduct evaluations in the base technical system, then in people and society. Our framework for a base system defines seven categories of social impact: bias, stereotypes, and representational harms; cultural values and sensitive content; disparate performance; privacy and data protection; financial costs; environmental costs; and data and content moderation labor costs. Suggested methods for evaluation apply to all modalities and analyses of the limitations of existing evaluations serve as a starting point for necessary investment in future evaluations. We offer five overarching categories for what is able to be evaluated in society, each with their own subcategories: trustworthiness and autonomy; inequality, marginalization, and violence; concentration of authority; labor and creativity; and ecosystem and environment. 
Each subcategory includes recommendations for mitigating harm. We are concurrently crafting an evaluation repository for the AI research community to contribute existing evaluations along the given categories. This version will be updated following a CRAFT session at ACM FAccT 2023. }, keywords = {ml, fairness}, url = {http://pub.hal3.name/#daume23impact}, } @inproceedings{daume23proxy, title = {The Impact of Explanations on Fairness in Human-AI Decision-Making: Protected vs Proxy Features}, author = {Navita Goyal and Connor Baumler and Tin Nguyen and Daum\'e, III, Hal}, booktitle = {IUI}, year = {2024}, abstract = { AI systems have been known to amplify biases in real world data. Explanations may help human-AI teams address these biases for fairer decision-making. Typically, explanations focus on salient input features. If a model is biased against some protected group, explanations may include features that demonstrate this bias, but when biases are realized through proxy features, the relationship between this proxy feature and the protected one may be less clear to a human. In this work, we study the effect of the presence of protected and proxy features on participants' perception of model fairness and their ability to improve demographic parity over an AI alone. Further, we examine how different treatments -- explanations, model bias disclosure and proxy correlation disclosure -- affect fairness perception and parity. We find that explanations help people detect direct biases but not indirect biases. Additionally, regardless of bias type, explanations tend to increase agreement with model biases. Disclosures can help mitigate this effect for indirect biases, improving both unfairness recognition and the decision-making fairness. We hope that our findings can help guide further research into advancing explanations in support of fair human-AI decision-making. 
}, keywords = {exp}, url = {http://pub.hal3.name/#daume23proxy}, } @inproceedings{daume23rose, title = {A Rose by Any Other Name would not Smell as Sweet: Social Bias in Name Mistranslations}, author = {Sandra Sandoval and Jieyu Zhao and Marine Carpuat and Daum\'e, III, Hal}, booktitle = {Proceedings of the Conference on Empirical Methods in Natural Language Processing (EMNLP)}, year = {2023}, abstract = { We ask the question: Are there widespread disparities in machine translations of names across race/ethnicity, and gender? We hypothesize that the translation quality of names and surrounding context will be lower for names associated with US racial and ethnic minorities due to these systems’ tendencies to standardize language to predominant language patterns. We develop a dataset of names that are strongly demographically aligned and propose a translation evaluation procedure based on round-trip translation. We analyze the effect of name demographics on translation quality using generalized linear mixed effects models and find that the ability of translation systems to correctly translate female-associated names is significantly lower than male-associated names. This effect is particularly pronounced for femaleassociated names that are also associated with racial (Black) and ethnic (Hispanic) minorities. This disparity in translation quality between social groups for something as personal as someone’s name has significant implications for people’s professional, personal and cultural identities, self-worth and ease of communication. Our findings suggest that more MT research is needed to improve the translation of names and to provide high-quality service for users regardless of gender, race, and ethnicity. 
}, keywords = {nlp}, url = {http://pub.hal3.name/#daume23rose}, } @inproceedings{daume23taco, title = {TACO: Temporal Latent Action-Driven Contrastive Loss for Visual Reinforcement Learning}, author = {Ruijie Zheng and Xiyao Wang and Yanchao Sun and Shuang Ma and Jieyu Zhao and Huazhe Xu and Daum\'e, III, Hal and Furong Huang}, booktitle = {NeurIPS}, year = {2023}, abstract = { Despite recent progress in reinforcement learning (RL) from raw pixel data, sample inefficiency continues to present a substantial obstacle. Prior works have attempted to address this challenge by creating self-supervised auxiliary tasks, aiming to enrich the agent's learned representations with control-relevant information for future state prediction. However, these objectives are often insufficient to learn representations that can represent the optimal policy or value function, and they often consider tasks with small, abstract discrete action spaces and thus overlook the importance of action representation learning in continuous control. In this paper, we introduce TACO: Temporal Action-driven Contrastive Learning, a simple yet powerful temporal contrastive learning approach that facilitates the concurrent acquisition of latent state and action representations for agents. TACO simultaneously learns a state and an action representation by optimizing the mutual information between representations of current states paired with action sequences and representations of the corresponding future states. Theoretically, TACO can be shown to learn state and action representations that encompass sufficient information for control, thereby improving sample efficiency. For online RL, TACO achieves 40% performance boost after one million environment interaction steps on average across nine challenging visual continuous control tasks from Deepmind Control Suite. 
In addition, we show that TACO can also serve as a plug-and-play module adding to existing offline visual RL methods to establish the new state-of-the-art performance for offline visual RL across offline datasets with varying quality. }, keywords = {rl}, url = {http://pub.hal3.name/#daume23taco}, } @inproceedings{daume23truthfulness, title = {Large Language Models Help Humans Verify Truthfulness -- Except When They Are Convincingly Wrong}, author = {Chenglei Si and Navita Goyal and Sherry Tongshuang Wu and Chen Zhao and Shi Feng and Daum\'e, III, Hal and Jordan Boyd-Graber}, booktitle = {Preprint}, year = {2023}, abstract = { Large Language Models (LLMs) are increasingly used for accessing information on the web. Their truthfulness and factuality are thus of great interest. To help users make the right decisions about the information they're getting, LLMs should not only provide but also help users fact-check information. In this paper, we conduct experiments with 80 crowdworkers in total to compare language models with search engines (information retrieval systems) at facilitating fact-checking by human users. We prompt LLMs to validate a given claim and provide corresponding explanations. Users reading LLM explanations are significantly more efficient than using search engines with similar accuracy. However, they tend to over-rely the LLMs when the explanation is wrong. To reduce over-reliance on LLMs, we ask LLMs to provide contrastive information - explain both why the claim is true and false, and then we present both sides of the explanation to users. This contrastive explanation mitigates users' over-reliance on LLMs, but cannot significantly outperform search engines. However, showing both search engine results and LLM explanations offers no complementary benefits as compared to search engines alone. 
Taken together, natural language explanations by LLMs may not be a reliable replacement for reading the retrieved passages yet, especially in high-stakes settings where over-relying on wrong AI explanations could lead to critical consequences. }, keywords = {llm}, url = {http://pub.hal3.name/#daume23truthfulness}, } @inproceedings{daume24drm, title = {DrM: Mastering Visual Reinforcement Learning through Dormant Ratio Minimization}, author = {Guowei Xu and Ruijie Zheng and Yongyuan Liang and Xiyao Wang and Zhecheng Yuan and Tianying Ji and Yu Luo and Xiaoyu Liu and Jiaxin Yuan and Pu Hua and Shuzhen Li and Yanjie Ze and Daum\'e, III, Hal and Furong Huang and Huazhe Xu}, booktitle = {ICLR}, year = {2024}, abstract = { Visual reinforcement learning (RL) has shown promise in continuous control tasks. Despite its progress, current algorithms are still unsatisfactory in virtually every aspect of the performance such as sample efficiency, asymptotic performance, and their robustness to the choice of random seeds. In this paper, we identify a major shortcoming in existing visual RL methods that is the agents often exhibit sustained inactivity during early training, thereby limiting their ability to explore effectively. Expanding upon this crucial observation, we additionally unveil a significant correlation between the agents' inclination towards motorically inactive exploration and the absence of neuronal activity within their policy networks. To quantify this inactivity, we adopt dormant ratio as a metric to measure inactivity in the RL agent's network. Empirically, we also recognize that the dormant ratio can act as a standalone indicator of an agent's activity level, regardless of the received reward signals. Leveraging the aforementioned insights, we introduce DrM, a method that uses three core mechanisms to guide agents' exploration-exploitation trade-offs by actively minimizing the dormant ratio. 
Experiments demonstrate that DrM achieves significant improvements in sample efficiency and asymptotic performance with no broken seeds (76 seeds in total) across three continuous control benchmark environments, including DeepMind Control Suite, MetaWorld, and Adroit. Most importantly, DrM is the first model-free algorithm that consistently solves tasks in both the Dog and Manipulator domains from the DeepMind Control Suite as well as three dexterous hand manipulation tasks without demonstrations in Adroit, all based on pixel observations. }, keywords = {rl}, url = {http://pub.hal3.name/#daume24drm}, }