% Conference and workshop papers dated 2022, one entry per paper.
% Layout: one field per line, brace-delimited values, bare month macros.
% Citation keys are kept exactly as they were so existing \cite commands keep working.

@inproceedings{gan-etal-2022-measuring,
  title     = {Measuring and Improving Compositional Generalization in Text-to-{SQL} via Component Alignment},
  author    = {Gan, Yujian and Chen, Xinyun and Huang, Qiuping and Purver, Matthew},
  booktitle = {Findings of the Association for Computational Linguistics: NAACL 2022},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.findings-naacl.62},
  doi       = {10.18653/v1/2022.findings-naacl.62},
  pages     = {831--843},
  abstract  = {In text-to-SQL tasks {---} as in much of NLP {---} \textit{compositional generalization} is a major challenge: neural networks struggle with compositional generalization where training and test distributions differ. However, most recent attempts to improve this are based on word-level synthetic data or specific dataset splits to generate compositional biases. In this work, we propose a clause-level compositional example generation method. We first split the sentences in the Spider text-to-SQL dataset into sub-sentences, annotating each sub-sentence with its corresponding SQL clause, resulting in a new dataset Spider-SS. We then construct a further dataset, Spider-CG, by composing Spider-SS sub-sentences in different combinations, to test the ability of models to generalize compositionally. Experiments show that existing models suffer significant performance degradation when evaluated on Spider-CG, even though every sub-sentence is seen during training. To deal with this problem, we modify a number of state-of-the-art models to train on the segmented data of Spider-SS, and we show that this method improves the generalization performance.},
}

@inproceedings{gregoromichelaki-etal-2022-language,
  title     = {Language and Cognition as Distributed Process and Interactions},
  author    = {Gregoromichelaki, Eleni and Eshghi, Arash and Howes, Christine and Mills, Gregory and Kempson, Ruth and Hough, Julian and Healey, Patrick and Purver, Matthew},
  booktitle = {Proceedings of the 26th Workshop on the Semantics and Pragmatics of Dialogue - Full Papers},
  month     = aug,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {SEMDIAL},
  url       = {http://semdial.org/anthology/Z22-Gregoromichelaki_semdial_0018.pdf},
}

% NOTE(review): the key below says 2021 but the entry's year is 2022; the key is
% retained unchanged for citation compatibility.
% The conference city is stored in "venue" and the publisher city in "address",
% so the two no longer compete for the same (aliased) location field.
@inproceedings{KhareEtAl2021ICWSM,
  author    = {Khare, Prashant and Karan, Mladen and McQuistin, Stephen and Perkins, Colin and Tyson, Gareth and Purver, Matthew and Healey, Patrick and Castro, Ignacio},
  title     = {The Web We Weave: Untangling the Social Graph of the {IETF}},
  year      = {2022},
  booktitle = {Proceedings of the 16th International {AAAI} Conference on Web and Social Media ({ICWSM})},
  venue     = {Atlanta, GA, USA},
  pages     = {500--511},
  publisher = {AAAI Press},
  address   = {Palo Alto, CA, USA},
  issn      = {2334-0770},
  isbn      = {978-1-57735-875-6},
  doi       = {10.1609/icwsm.v16i1.19310},
  url       = {https://doi.org/10.1609/icwsm.v16i1.19310},
}

@inproceedings{koloski-etal-2022-knowledge,
  title     = {Knowledge informed sustainability detection from short financial texts},
  author    = {Koloski, Boshko and Montariol, Syrielle and Purver, Matthew and Pollak, Senja},
  editor    = {Chen, Chung-Chi and Huang, Hen-Hsen and Takamura, Hiroya and Chen, Hsin-Hsi},
  booktitle = {Proceedings of the Fourth Workshop on Financial Technology and Natural Language Processing (FinNLP)},
  month     = dec,
  year      = {2022},
  address   = {Abu Dhabi, United Arab Emirates (Hybrid)},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.finnlp-1.31},
  doi       = {10.18653/v1/2022.finnlp-1.31},
  pages     = {228--234},
  abstract  = {There is a global trend for responsible investing and the need for developing automated methods for analyzing and Environmental, Social and Governance (ESG) related elements in financial texts is raising. In this work we propose a solution to the FinSim4-ESG task, consisting of binary classification of sentences into sustainable or unsustainable. We propose a novel knowledge-based latent heterogeneous representation that is based on knowledge from taxonomies and knowledge graphs and multiple contemporary document representations. We hypothesize that an approach based on a combination of knowledge and document representations can introduce significant improvement over conventional document representation approaches. We consider ensembles on classifier as well on representation level late-fusion and early fusion. The proposed approaches achieve competitive accuracy of 89 and are 5.85 behind the best achieved score.},
}

@inproceedings{nakwijit-purver-2022-misspelling,
  title     = {Misspelling Semantics in {Thai}},
  author    = {Nakwijit, Pakawat and Purver, Matthew},
  booktitle = {Proceedings of the Thirteenth Language Resources and Evaluation Conference},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  url       = {https://aclanthology.org/2022.lrec-1.24},
  pages     = {227--236},
  abstract  = {User-generated content is full of misspellings. Rather than being just random noise, we hypothesise that many misspellings contain hidden semantics that can be leveraged for language understanding tasks. This paper presents a fine-grained annotated corpus of misspelling in Thai, together with an analysis of misspelling intention and its possible semantics to get a better understanding of the misspelling patterns observed in the corpus. In addition, we introduce two approaches to incorporate the semantics of misspelling: Misspelling Average Embedding (MAE) and Misspelling Semantic Tokens (MST). Experiments on a sentiment analysis task confirm our overall hypothesis: additional semantics from misspelling can boost the micro F1 score up to 0.4-2{\%}, while blindly normalising misspelling is harmful and suboptimal.},
}

% NOTE(review): author spellings normalised here ("Perez-Liebana"; "Purver, Matthew"
% to match the other entries) -- confirm against the published byline.
@inproceedings{poesio-etal-2022-arciduca,
  title     = {{ARCIDUCA}: Annotating Reference and Coreference In Dialogue Using Conversational Agents in games},
  author    = {Poesio, Massimo and Bartle, Richard and Chamberlain, Jon and Hough, Julian and Madge, Chris and Perez-Liebana, Diego and Purver, Matthew and Yu, Juntao},
  booktitle = {Proceedings of the 26th Workshop on the Semantics and Pragmatics of Dialogue - Poster Abstracts},
  month     = aug,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {SEMDIAL},
  url       = {http://semdial.org/anthology/Z22-Poesio_semdial_0036.pdf},
}

@inproceedings{purver-etal-2022-tracking,
  title     = {Tracking Changes in {ESG} Representation: Initial Investigations in {UK} Annual Reports},
  author    = {Purver, Matthew and Martinc, Matej and Ichev, Riste and Lon{\v{c}}arski, Igor and Sitar {\v{S}}u{\v{s}}tar, Katarina and Valentin{\v{c}}i{\v{c}}, Aljo{\v{s}}a and Pollak, Senja},
  booktitle = {Proceedings of the First Computing Social Responsibility Workshop within the 13th Language Resources and Evaluation Conference},
  editor    = {Wan, Mingyu and Huang, Chu-Ren},
  month     = jun,
  year      = {2022},
  address   = {Marseille, France},
  publisher = {European Language Resources Association},
  isbn      = {979-10-95546-89-4},
  url       = {https://aclanthology.org/2022.csrnlp-1.2},
  pages     = {9--14},
  abstract  = {We describe initial work into analysing the language used around environmental, social and governance (ESG) issues in UK company annual reports. We collect a dataset of annual reports from UK FTSE350 companies over the years 2012-2019; separately, we define a categorized list of core ESG terms (single words and multi-word expressions) by combining existing lists with manual annotation. We then show that this list can be used to analyse the changes in ESG language in the dataset over time, via a combination of language modelling and distributional modelling via contextual word embeddings. Initial findings show that while ESG discussion in annual reports is becoming significantly more likely over time, the increase varies with category and with individual terms, and that some terms show noticeable changes in usage.},
}

% The entry below previously carried two "url" fields. The ACL Anthology link is
% kept in "url"; the arXiv identifier is recorded in eprint/archiveprefix instead.
@inproceedings{shekhar-etal-2022-coral,
  title         = {{CoRAL}: a Context-aware {Croatian} Abusive Language Dataset},
  author        = {Shekhar, Ravi and Karan, Mladen and Purver, Matthew},
  booktitle     = {Findings of the Association for Computational Linguistics: AACL-IJCNLP 2022},
  month         = nov,
  year          = {2022},
  address       = {Online only},
  publisher     = {Association for Computational Linguistics},
  doi           = {10.5281/zenodo.7941535},
  isbn          = {978-1-959429-04-3},
  annote        = {ISBN 978-1-959429-04-3},
  url           = {https://aclanthology.org/2022.findings-aacl.21},
  eprint        = {2211.06053},
  archiveprefix = {arXiv},
  pages         = {217--225},
  abstract      = {In light of unprecedented increases in the popularity of the internet and social media, comment moderation has never been a more relevant task. Semi-automated comment moderation systems greatly aid human moderators by either automatically classifying the examples or allowing the moderators to prioritize which comments to consider first. However, the concept of inappropriate content is often subjective, and such content can be conveyed in many subtle and indirect ways. In this work, we propose CoRAL {--} a language and culturally aware Croatian Abusive dataset covering phenomena of implicitness and reliance on local and global context. We show experimentally that current models degrade when comments are not explicit and further degrade when language skill and context knowledge are required to interpret the comment.},
}

@inproceedings{tran-etal-2022-jsi,
  title     = {{JSI} at {SemEval}-2022 Task 1: {CODWOE} - Reverse Dictionary: Monolingual and cross-lingual approaches},
  author    = {Tran, Thi Hong Hanh and Martinc, Matej and Purver, Matthew and Pollak, Senja},
  booktitle = {Proceedings of the 16th International Workshop on Semantic Evaluation (SemEval-2022)},
  month     = jul,
  year      = {2022},
  address   = {Seattle, United States},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.semeval-1.12},
  doi       = {10.18653/v1/2022.semeval-1.12},
  pages     = {101--106},
  abstract  = {The reverse dictionary task is a sequence-to-vector task in which a gloss is provided as input, and the output must be a semantically matching word vector. The reverse dictionary is useful in practical applications such as solving the tip-of-the-tongue problem, helping new language learners, etc. In this paper, we evaluate the effect of a Transformer-based model with cross-lingual zero-shot learning to improve the reverse dictionary performance. Our experiments are conducted in five languages in the CODWOE dataset, including English, French, Italian, Spanish, and Russian. Even if we did not achieve a good ranking in the CODWOE competition, we show that our work partially improves the current baseline from the organizers with a hypothesis on the impact of LSTM in monolingual, multilingual, and zero-shot learning. All the codes are available at https://github.com/honghanhh/codwoe2021.},
}

@inproceedings{WrightPurver2022TSD,
  author    = {Wright, George A. and Purver, Matthew},
  editor    = {Sojka, Petr and Hor{\'a}k, Ale{\v{s}} and Kope{\v{c}}ek, Ivan and Pala, Karel},
  title     = {A Self-Evaluating Architecture for Describing Data},
  booktitle = {Text, Speech, and Dialogue},
  year      = {2022},
  publisher = {Springer International Publishing},
  address   = {Cham},
  pages     = {187--198},
  isbn      = {978-3-031-16270-1},
  doi       = {10.1007/978-3-031-16270-1_16},
  url       = {https://doi.org/10.1007/978-3-031-16270-1_16},
  abstract  = {This paper introduces Linguoplotter, a workspace-based architecture for generating short natural language descriptions. All processes within Linguoplotter are carried out by codelets, small pieces of code each responsible for making incremental changes to the program's state, the idea of which is borrowed from Hofstadter et al. [6]. Codelets in Linguoplotter gradually transform a representation of temperatures on a map into a description which can be output. Many processes emerge in the program out of the actions of many codelets, including language generation, self-evaluation, and higher-level decisions such as when to stop a given process, and when to end all processing and publish a final text. The program outputs a piece of text along with a satisfaction score indicating how good the program judges the text to be. The iteration of the program described in this paper is capable of linguistically more diverse outputs than a previous version; human judges rate the outputs of this version more highly than those of the last; and there is some correlation between rankings by human judges and the program's own satisfaction score. But, the program still publishes disappointingly short and simple texts (despite being capable of longer, more complete descriptions). This paper describes: the workings of the program; a recent evaluation of its performance; and possible improvements for a future iteration.},
}