@inproceedings{CaporussoEtAl23SiKDD,
  title     = {Compared to Us, They Are \ldots: An Exploration of Social Biases in {English} and {Italian} Language Models Using Prompting and Sentiment Analysis},
  author    = {Caporusso, Jaya and Pollak, Senja and Purver, Matthew},
  booktitle = {Proceedings of the {Slovenian} {KDD} Conference},
  editor    = {Grobelnik, Marko and Mladeni{\'{c}}, Dunja},
  location  = {Ljubljana, Slovenia},
  year      = 2023,
  month     = oct,
  pages     = {33--38},
  url       = {https://ailab.ijs.si/dunja/SiKDD2023/Papers/IS_2023_-_SIKDD_paper_10.pdf},
}

@inproceedings{gan-etal-2023-appraising,
  title     = {Re-appraising the Schema Linking for Text-to-{SQL}},
  author    = {Gan, Yujian and Chen, Xinyun and Purver, Matthew},
  editor    = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  month     = jul,
  year      = 2023,
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-acl.53},
  doi       = {10.18653/v1/2023.findings-acl.53},
  pages     = {835--852},
  abstract  = {Most text-to-SQL models, even though based on the same grammar decoder, generate the SQL structure first and then fill in the SQL slots with the correct schema items. This second step depends on schema linking: aligning the entity references in the question with the schema columns or tables. This is generally approached via Exact Match based Schema Linking (EMSL) within a neural network-based schema linking module. EMSL has become standard in text-to-SQL: many state-of-the-art models employ EMSL, with performance dropping significantly when the EMSL component is removed. In this work, however, we show that EMSL reduces robustness, rendering models vulnerable to synonym substitution and typos. Instead of relying on EMSL to make up for deficiencies in question-schema encoding, we show that using a pre-trained language model as an encoder can improve performance without using EMSL, giving a more robust model. We also study the design choice of the schema linking module, finding that a suitable design benefits performance and interoperability. Finally, based on the above study of schema linking, we introduce the grammar linking to help model align grammar references in the question with the SQL keywords.},
}

@inproceedings{GhinassiEtAl23ICME,
  author    = {Ghinassi, Iacopo and Purver, Matthew and Phan, Huy and Newell, Chris},
  booktitle = {IEEE International Conference on Multimedia and Expo (ICME)},
  title     = {Exploring Pre-Trained Neural Audio Representations for Audio Topic Segmentation},
  year      = 2023,
  pages     = {1086--1091},
  doi       = {10.1109/ICME55011.2023.00190},
  url       = {https://doi.org/10.1109/ICME55011.2023.00190},
  publisher = {IEEE Computer Society},
  address   = {Los Alamitos, CA},
  month     = jul,
  abstract  = {Recent works have shown that audio embeddings can improve automatic topic segmentation of formats such as radio shows. In this work we expand the work in that direction by showing how and which publicly available, pre-trained neural audio embeddings can perform the task, without the need of any further fine-tuning of the audio encoders. The ranking of the encoders suggest that neural encoders pre-trained for speaker diarization and general purpose audio classification are the best suited to be used as features, beating non-neural baselines. We show that we can obtain perfect results on a newly created random dataset similar to the one used in previous work. We also show for the first time results on real-world data, proving that our method can be applied to actual radio shows with good results, but the choice of audio encoders is extremely important in order to achieve those. Finally, by releasing the datasets we used we make the contribution of providing the first (to our knowledge) publicly available, free of charge datasets for audio topic segmentation of media products.},
  keywords  = {media;encoding;task analysis;digital audio broadcasting;synthetic data},
}

@inproceedings{GhinassiEtAl2023ICMR,
  author    = {Ghinassi, Iacopo and Wang, Lin and Newell, Chris and Purver, Matthew},
  title     = {Multimodal Topic Segmentation of Podcast Shows with Pre-Trained Neural Encoders},
  year      = 2023,
  isbn      = {9798400701788},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3591106.3592270},
  doi       = {10.1145/3591106.3592270},
  abstract  = {We present two multimodal models for topic segmentation of podcasts built on pre-trained neural text and audio embeddings. We show that results can be improved by combining different modalities; but also by combining different encoders from the same modality, especially general-purpose sentence embeddings with specifically fine-tuned ones. We also show that audio embeddings can be substituted with two simple features related to sentence duration and inter-sentential pauses with comparable results. Finally, we publicly release our two datasets, the first in our knowledge publicly and freely available multimodal datasets for topic segmentation.},
  booktitle = {Proceedings of the 2023 ACM International Conference on Multimedia Retrieval},
  pages     = {602--606},
  numpages  = {5},
  keywords  = {multi-modal, topic segmentation},
  location  = {Thessaloniki, Greece},
  series    = {ICMR '23},
}

@article{GhinassiEtAl23PeerJ,
  title     = {Comparing Neural Sentence Encoders for Topic Segmentation Across Domains: Not Your Typical Text Similarity Task},
  author    = {Ghinassi, Iacopo and Wang, Lin and Newell, Chris and Purver, Matthew},
  year      = 2023,
  journal   = {{PeerJ} Computer Science},
  volume    = 9,
  pages     = {e1593},
  doi       = {10.7717/peerj-cs.1593},
  url       = {https://doi.org/10.7717/peerj-cs.1593},
}

@inproceedings{ghinassi-etal-2023-lessons,
  title     = {Lessons Learnt from Linear Text Segmentation: a Fair Comparison of Architectural and Sentence Encoding Strategies for Successful Segmentation},
  author    = {Ghinassi, Iacopo and Wang, Lin and Newell, Chris and Purver, Matthew},
  editor    = {Mitkov, Ruslan and Angelova, Galia},
  booktitle = {Proceedings of the 14th International Conference on Recent Advances in Natural Language Processing},
  month     = sep,
  year      = 2023,
  address   = {Varna, Bulgaria},
  publisher = {INCOMA Ltd., Shoumen, Bulgaria},
  url       = {https://aclanthology.org/2023.ranlp-1.46},
  pages     = {408--418},
  abstract  = {Recent works on linear text segmentation have shown new state-of-the-art results nearly every year. Most times, however, these recent advances include a variety of different elements which makes it difficult to evaluate which individual components of the proposed methods bring about improvements for the task and, more generally, what actually works for linear text segmentation. Moreover, evaluating text segmentation is notoriously difficult and the use of a metric such as Pk, which is widely used in existing literature, presents specific problems that complicates a fair comparison between segmentation models. In this work, then, we draw from a number of existing works to assess which is the state-of-the-art in linear text segmentation, investigating what architectures and features work best for the task. For doing so, we present three models representative of a variety of approaches, we compare them to existing methods and we inspect elements composing them, so as to give a more complete picture of which technique is more successful and why that might be the case. At the same time, we highlight a specific feature of Pk which can bias the results and we report our results using different settings, so as to give future literature a more comprehensive set of baseline results for future developments. We then hope that this work can serve as a solid foundation to foster research in the area, overcoming task-specific difficulties such as evaluation setting and providing new state-of-the-art results.},
}

@inproceedings{gkoumas-etal-2023-reformulating,
  title     = {Reformulating {NLP} tasks to Capture Longitudinal Manifestation of Language Disorders in People with Dementia.},
  author    = {Gkoumas, Dimitris and Purver, Matthew and Liakata, Maria},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = 2023,
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.emnlp-main.986},
  doi       = {10.18653/v1/2023.emnlp-main.986},
  pages     = {15904--15917},
  abstract  = {Dementia is associated with language disorders which impede communication. Here, we automatically learn linguistic disorder patterns by making use of a moderately-sized pre-trained language model and forcing it to focus on reformulated natural language processing (NLP) tasks and associated linguistic patterns. Our experiments show that NLP tasks that encapsulate contextual information and enhance the gradient signal with linguistic patterns benefit performance. We then use the probability estimates from the best model to construct digital linguistic markers measuring the overall quality in communication and the intensity of a variety of language disorders. We investigate how the digital markers characterize dementia speech from a longitudinal perspective. We find that our proposed communication marker is able to robustly and reliably characterize the language of people with dementia, outperforming existing linguistic approaches; and shows external validity via significant correlation with clinical markers of behaviour. Finally, our proposed linguistic disorder markers provide useful insights into gradual language impairment associated with disease progression.},
}

@article{HealeyEtAl2023BBS,
  title     = {``Who's there?'': Depicting identity in interaction},
  author    = {Healey, Patrick G. T. and Howes, Christine and Kempson, Ruth and Mills, Gregory J. and Purver, Matthew and Gregoromichelaki, Eleni and Eshghi, Arash and Hough, Julian},
  volume    = {46},
  doi       = {10.1017/S0140525X22001492},
  url       = {https://doi.org/10.1017/S0140525X22001492},
  journal   = {Behavioral and Brain Sciences},
  publisher = {Cambridge University Press},
  year      = 2023,
  pages     = {e37},
}

@inproceedings{hosseini-etal-2023-lon,
  title     = {Lon-e{\aa} at {S}em{E}val-2023 Task 11: A Comparison of Activation Functions for Soft and Hard Label Prediction},
  author    = {Hosseini, Peyman and Hosseini, Mehran and Al-azzawi, Sana and Liwicki, Marcus and Castro, Ignacio and Purver, Matthew},
  editor    = {Ojha, Atul Kr. and Do{\u{g}}ru{\"o}z, A. Seza and Da San Martino, Giovanni and Tayyar Madabushi, Harish and Kumar, Ritesh and Sartori, Elisa},
  booktitle = {Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)},
  month     = jul,
  year      = 2023,
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.semeval-1.185},
  doi       = {10.18653/v1/2023.semeval-1.185},
  pages     = {1329--1334},
  abstract  = {We study the influence of different activation functions in the output layer of pre-trained transformer models for soft and hard label prediction in the learning with disagreement task. In this task, the goal is to quantify the amount of disagreement via predicting soft labels. To predict the soft labels, we use BERT-based preprocessors and encoders and vary the activation function used in the output layer, while keeping other parameters constant. The soft labels are then used for the hard label prediction. The activation functions considered are sigmoid as well as a step-function that is added to the model post-training and a sinusoidal activation function, which is introduced for the first time in this paper.},
}

@inproceedings{ivacic-etal-2023-analysis,
  title     = {Analysis of Transfer Learning for Named Entity Recognition in {S}outh-{S}lavic Languages},
  author    = {Iva{\v{c}}i{\v{c}}, Nikola and Tran, Thi Hong Hanh and Koloski, Boshko and Pollak, Senja and Purver, Matthew},
  editor    = {Piskorski, Jakub and Marci{\'n}czuk, Micha{\l} and Nakov, Preslav and Ogrodniczuk, Maciej and Pollak, Senja and P{\v{r}}ib{\'a}{\v{n}}, Pavel and Rybak, Piotr and Steinberger, Josef and Yangarber, Roman},
  booktitle = {Proceedings of the 9th Workshop on Slavic Natural Language Processing 2023 (SlavicNLP 2023)},
  month     = may,
  year      = 2023,
  address   = {Dubrovnik, Croatia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.bsnlp-1.13},
  doi       = {10.18653/v1/2023.bsnlp-1.13},
  pages     = {106--112},
  abstract  = {This paper analyzes a Named Entity Recognition task for South-Slavic languages using the pre-trained multilingual neural network models. We investigate whether the performance of the models for a target language can be improved by using data from closely related languages. We have shown that the model performance is not influenced substantially when trained with other than a target language. While for Slovene, the monolingual setting generally performs better, for Croatian and Serbian the results are slightly better in selected cross-lingual settings, but the improvements are not large. The most significant performance improvement is shown for the Serbian language, which has the smallest corpora. Therefore, fine-tuning with other closely related languages may benefit only the {``}low resource{''} languages.},
}

@inproceedings{karan-etal-2023-leda,
  title     = {{LEDA}: a Large-Organization Email-Based Decision-Dialogue-Act Analysis Dataset},
  author    = {Karan, Mladen and Khare, Prashant and Shekhar, Ravi and McQuistin, Stephen and Castro, Ignacio and Tyson, Gareth and Perkins, Colin and Healey, Patrick and Purver, Matthew},
  editor    = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  booktitle = {Findings of the Association for Computational Linguistics: ACL 2023},
  month     = jul,
  year      = 2023,
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.findings-acl.378},
  doi       = {10.18653/v1/2023.findings-acl.378},
  pages     = {6080--6089},
  abstract  = {Collaboration increasingly happens online. This is especially true for large groups working on global tasks, with collaborators all around the globe. The size and distributed nature of such groups makes decision-making challenging. This paper proposes a set of dialog acts for the study of decision-making mechanisms in such groups, and provides a new annotated dataset based on real-world data from the public mail-archives of one such organisation {--} the Internet Engineering Task Force (IETF). We provide an initial data analysis showing that this dataset can be used to better understand decision-making in such organisations. Finally, we experiment with a preliminary transformer-based dialog act tagging model.},
}

@inproceedings{khare-etal-2023-tracing,
  title     = {Tracing Linguistic Markers of Influence in a Large Online Organisation},
  author    = {Khare, Prashant and Shekhar, Ravi and Karan, Mladen and McQuistin, Stephen and Perkins, Colin and Castro, Ignacio and Tyson, Gareth and Healey, Patrick and Purver, Matthew},
  editor    = {Rogers, Anna and Boyd-Graber, Jordan and Okazaki, Naoaki},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
  month     = jul,
  year      = 2023,
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.acl-short.8},
  doi       = {10.18653/v1/2023.acl-short.8},
  pages     = {82--90},
  abstract  = {Social science and psycholinguistic research have shown that power and status affect how people use language in a range of domains. Here, we investigate a similar question in a large, distributed, consensus-driven community with little traditional power hierarchy {--} the Internet Engineering Task Force (IETF), a collaborative organisation that designs internet standards. Our analysis based on lexical categories (LIWC) and BERT, shows that participants{'} levels of influence can be predicted from their email text, and identify key linguistic differences (e.g., certain LIWC categories, such as {``}WE{''} are positively correlated with high-influence). We also identify the differences in language use for the same person before and after becoming influential.},
}

@inproceedings{McQuistinEtAl23TMA,
  author    = {McQuistin, Stephen and Karan, Mladen and Khare, Prashant and Perkins, Colin and Purver, Matthew and Healey, Patrick and Castro, Ignacio and Tyson, Gareth},
  booktitle = {Proceedings of the 7th Network Traffic Measurement and Analysis Conference ({TMA})},
  title     = {Errare humanum est: What do {RFC} Errata say about Internet Standards?},
  year      = 2023,
  pages     = {1--9},
  isbn      = {978-3-903176-58-4},
  doi       = {10.23919/TMA58422.2023.10198980},
  url       = {https://doi.org/10.23919/TMA58422.2023.10198980},
}

@inproceedings{nakwijit-etal-2023-lexicools,
  title     = {Lexicools at {S}em{E}val-2023 Task 10: Sexism Lexicon Construction via {XAI}},
  author    = {Nakwijit, Pakawat and Samir, Mahmoud and Purver, Matthew},
  editor    = {Ojha, Atul Kr. and Do{\u{g}}ru{\"o}z, A. Seza and Da San Martino, Giovanni and Tayyar Madabushi, Harish and Kumar, Ritesh and Sartori, Elisa},
  booktitle = {Proceedings of the 17th International Workshop on Semantic Evaluation (SemEval-2023)},
  month     = jul,
  year      = 2023,
  address   = {Toronto, Canada},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.semeval-1.4},
  doi       = {10.18653/v1/2023.semeval-1.4},
  pages     = {23--43},
  abstract  = {This paper presents our work on the SemEval-2023 Task 10 Explainable Detection of Online Sexism (EDOS) using lexicon-based models. Our approach consists of three main steps: lexicon construction based on Pointwise Mutual Information (PMI) and Shapley value, lexicon augmentation using an unannotated corpus and Large Language Models (LLMs), and, lastly, lexical incorporation for Bag-of-Word (BoW) logistic regression and fine-tuning LLMs. Our results demonstrate that our Shapley approach effectively produces a high-quality lexicon. We also show that by simply counting the presence of certain words in our lexicons and comparing the count can outperform a BoW logistic regression in task B/C and fine-tuning BERT in task C. In the end, our classifier achieved F1-scores of 53.34{\%} and 27.31{\%} on the official blind test sets for tasks B and C, respectively. We, additionally, provide in-depth analysis highlighting model limitation and bias. We also present our attempts to understand the model{'}s behaviour based on our constructed lexicons. Our code and the resulting lexicons are open-sourced in our GitHub repository \url{https://github.com/SirBadr/SemEval2022-Task10}.},
}

@incollection{WrightPurver2023AGI,
  author    = {Wright, George A. and Purver, Matthew},
  editor    = {Hammer, Patrick and Alirezaie, Marjan and Stranneg{\aa}rd, Claes},
  title     = {Self-Comprehension for More Coherent Language Generation},
  booktitle = {Artificial General Intelligence: 16th International Conference, AGI 2023},
  year      = 2023,
  series    = {Lecture Notes in Artificial Intelligence},
  number    = {13921},
  publisher = {Springer},
  pages     = {328--337},
  issn      = {0302-9743},
  isbn      = {978-3-031-33468-9},
  doi       = {10.1007/978-3-031-33469-6_33},
  url       = {https://doi.org/10.1007/978-3-031-33469-6_33},
}