Taste of Research
ALTA 2025
🏆 Best Paper Honorable Mention
PDF
BibTeX
@comment{Fix: the value 1834-7037 is in ISSN format (the serial number of the
  ALTA proceedings series), but was stored in an ISBN field; renamed to issn.
  All other values unchanged; name lists collapsed to one line per field.}
@inproceedings{singh-etal-2025-nek,
  title     = "{N}ek Minit: Harnessing Pragmatic Metacognitive Prompting for Explainable Sarcasm Detection of {A}ustralian and {I}ndian {E}nglish",
  author    = "Singh, Ishmanbir and Srirag, Dipankar and Joshi, Aditya",
  editor    = "Kummerfeld, Jonathan K. and Joshi, Aditya and Dras, Mark",
  booktitle = "Proceedings of the 23rd Annual Workshop of the Australasian Language Technology Association",
  month     = nov,
  year      = "2025",
  address   = "Sydney, Australia",
  publisher = "Association for Computational Linguistics",
  url       = "https://aclanthology.org/2025.alta-main.2/",
  pages     = "13--27",
  issn      = "1834-7037",
  abstract  = "Sarcasm is a challenge to sentiment analysis because of the incongruity between stated and implied sentiment. The challenge is exacerbated when the implication may be relevant to a specific country or geographical region. Pragmatic metacognitive prompting (PMP) is a cognition-inspired technique that has been used for pragmatic reasoning. In this paper, we harness PMP for explainable sarcasm detection for Australian and Indian English, alongside a benchmark dataset for standard English. We manually add sarcasm explanations to an existing sarcasm-labeled dataset for Australian and Indian English called BESSTIE, and compare the performance for explainable sarcasm detection for them with FLUTE, a standard English dataset containing sarcasm explanations. Our approach utilising PMP when evaluated on two open-weight LLMs (GEMMA and LLAMA) achieves statistically significant performance improvement across all tasks and datasets when compared with four alternative prompting strategies. We also find that alternative techniques such as agentic prompting mitigate context-related failures by enabling external knowledge retrieval. The focused contribution of our work is utilising PMP in generating sarcasm explanations for varieties of English."
}
Copied!

ALTA 2024
PDF
BibTeX
@inproceedings{chan-etal-2024-hate,
  title     = {``Is Hate Lost in Translation?'': Evaluation of Multilingual {LGBTQIA}+ Hate Speech Detection},
  author    = {Chan, Fai Leui and Nguyen, Duke and Joshi, Aditya},
  editor    = {Baldwin, Tim and Rodr{\'i}guez M{\'e}ndez, Sergio Jos{\'e} and Kuo, Nicholas},
  booktitle = {Proceedings of the 22nd Annual Workshop of the Australasian Language Technology Association},
  month     = dec,
  year      = {2024},
  address   = {Canberra, Australia},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.alta-1.11/},
  pages     = {146--152},
  abstract  = {This paper explores the challenges of detecting LGBTQIA+ hate speech of large language models across multiple languages, including English, Italian, Chinese and (code-mixed) English-Tamil, examining the impact of machine translation and whether the nuances of hate speech are preserved across translation. We examine the hate speech detection ability of zero-shot and fine-tuned GPT. Our findings indicate that: (1) English has the highest performance and the code-mixing scenario of English-Tamil being the lowest, (2) fine-tuning improves performance consistently across languages whilst translation yields mixed results. Through simple experimentation with original text and machine-translated text for hate speech detection along with a qualitative error analysis, this paper sheds light on the socio-cultural nuances and complexities of languages that may not be captured by automatic translation.}
}
Copied!

CMCL @ ACL 2024
PDF
BibTeX
@inproceedings{shen-etal-2024-bambino,
  title     = {{BAMBINO}-{LM}: (Bilingual-)Human-Inspired Continual Pre-training of {B}aby{LM}},
  author    = {Shen, Zhewen and Joshi, Aditya and Chen, Ruey-Cheng},
  editor    = {Kuribayashi, Tatsuki and Rambelli, Giulia and Takmaz, Ece and Wicke, Philipp and Oseki, Yohei},
  booktitle = {Proceedings of the Workshop on Cognitive Modeling and Computational Linguistics},
  month     = aug,
  year      = {2024},
  address   = {Bangkok, Thailand},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2024.cmcl-1.1/},
  doi       = {10.18653/v1/2024.cmcl-1.1},
  pages     = {1--7},
  abstract  = {Children from bilingual backgrounds benefit from interactions with parents and teachers to re-acquire their heritage language. In this paper, we investigate how this insight from behavioral study can be incorporated into the learning of small-scale language models. We introduce BAMBINO-LM, a continual pre-training strategy for BabyLM that uses a novel combination of alternation and PPO-based perplexity reward induced from a parent Italian model. Upon evaluation on zero-shot classification tasks for English and Italian, BAMBINO-LM improves the Italian language capability of a BabyLM baseline. Our ablation analysis demonstrates that employing both the alternation strategy and PPO-based modeling is key to this effectiveness gain. We also show that, as a side effect, the proposed method leads to a similar degradation in L1 effectiveness as human children would have had in an equivalent learning scenario. Through its modeling and findings, BAMBINO-LM makes a focused contribution to the pre-training of small-scale language models by first developing a human-inspired strategy for pre-training and then showing that it results in behaviours similar to that of humans.}
}
Copied!

Honours Projects
WWW 2025
PDF
BibTeX
@comment{Fix: the pages field used a literal Unicode en-dash (1123-en-dash-1127),
  which classic BibTeX cannot handle; replaced with the standard -- range marker.
  All other values unchanged from the ACM export.}
@inproceedings{10.1145/3701716.3715501,
  author    = {Lin, Jonathan and Joshi, Aditya and Paik, Hye-young and Doung, Tri Dung and Gurdasani, Deepti},
  title     = {RACCOON: A Retrieval-Augmented Generation Approach for Location Coordinate Capture from News Articles},
  year      = {2025},
  isbn      = {9798400713316},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  url       = {https://doi.org/10.1145/3701716.3715501},
  doi       = {10.1145/3701716.3715501},
  abstract  = {Geocoding involves automatic extraction of location coordinates of incidents reported in news articles, and can be used for epidemic intelligence or disaster management. This paper introduces Retrieval-Augmented Coordinate Capture Of Online News articles (RACCOON), an open-source geocoding approach that extracts geolocations from news articles. RACCOON uses a retrieval-augmented generation (RAG) approach where candidate locations and associated information are retrieved in the form of context from a location database, and a prompt containing the retrieved context, location mentions and news articles is fed to an LLM to generate the location coordinates. Our evaluation on three datasets, two underlying LLMs, three baselines and several ablation tests based on the components of RACCOON demonstrate the utility of RACCOON. To the best of our knowledge, RACCOON is the first RAG-based approach for geocoding using pre-trained LLMs.},
  booktitle = {Companion Proceedings of the ACM on Web Conference 2025},
  pages     = {1123--1127},
  numpages  = {5},
  keywords  = {geocoding, large language models, location extraction, news articles, rag, retrieval-augmented generation},
  location  = {Sydney NSW, Australia},
  series    = {WWW '25}
}
Copied!

COMP9991
PDF
BibTeX
@misc{kong2025cairnsbalancingreadabilityscientific,
  title         = {CAIRNS: Balancing Readability and Scientific Accuracy in Climate Adaptation Question Answering},
  author        = {Liangji Kong and Aditya Joshi and Sarvnaz Karimi},
  year          = {2025},
  eprint        = {2512.02251},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2512.02251}
}
Copied!

NAACL 2025
PDF
Code
BibTeX
@inproceedings{srirag-etal-2025-predicting,
  title     = {Predicting the Target Word of Game-playing Conversations using a Low-Rank Dialect Adapter for Decoder Models},
  author    = {Srirag, Dipankar and Joshi, Aditya and Eisenstein, Jacob},
  editor    = {Chiruzzo, Luis and Ritter, Alan and Wang, Lu},
  booktitle = {Proceedings of the 2025 Conference of the Nations of the Americas Chapter of the Association for Computational Linguistics: Human Language Technologies (Volume 2: Short Papers)},
  month     = apr,
  year      = {2025},
  address   = {Albuquerque, New Mexico},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.naacl-short.2/},
  doi       = {10.18653/v1/2025.naacl-short.2},
  pages     = {8--17},
  isbn      = {979-8-89176-190-2},
  abstract  = {Dialect adapters that improve the performance of LLMs for NLU tasks on certain sociolects/dialects/national varieties ({`}dialects' for the sake of brevity) have been reported for encoder models. In this paper, we extend the idea of dialect adapters to decoder models in our architecture called LoRDD. Using MD-3, a publicly available dataset of word game-playing conversations between dialectal speakers, our task is Target Word Prediction (TWP) from a masked conversation. LoRDD combines task adapters and dialect adapters where the latter employ contrastive learning on pseudo-parallel conversations from MD-3. Our experiments on Indian English and Nigerian English conversations with two models (Mistral and Gemma) demonstrate that LoRDD outperforms four baselines on TWP. Additionally, it significantly reduces the performance gap with American English, narrowing it to 12{\%} and 5.8{\%} for word similarity, and 25{\%} and 4.5{\%} for accuracy, respectively. The focused contribution of LoRDD is in its promise for dialect adaptation of decoder models using TWP, a simplified version of the commonly used next-word prediction task.}
}
Copied!

SUMEval @ COLING 2025
PDF
BibTeX
@inproceedings{srirag-etal-2025-evaluating,
  title     = {Evaluating Dialect Robustness of Language Models via Conversation Understanding},
  author    = {Srirag, Dipankar and Sahoo, Nihar Ranjan and Joshi, Aditya},
  booktitle = {Proceedings of the Second Workshop on Scaling Up Multilingual {\&} Multi-Cultural Evaluation},
  month     = jan,
  year      = {2025},
  address   = {Abu Dhabi},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2025.sumeval-2.3/},
  pages     = {24--38},
  abstract  = {With an evergrowing number of LLMs reporting superlative performance for English, their ability to perform equitably for different dialects of English (i.e., dialect robustness) needs to be ascertained. Specifically, we use English language (US English or Indian English) conversations between humans who play the word-guessing game of `taboo{`}. We formulate two evaluative tasks: target word prediction (TWP) (i.e., predict the masked target word in a conversation) and target word selection (TWS) (i.e., select the most likely masked target word in a conversation, from among a set of candidate words). Extending MD3, an existing dialectic dataset of taboo-playing conversations, we introduce M-MD3, a target-word-masked version of MD3 with the en-US and en-IN subsets. We create two subsets: en-MV (where en-US is transformed to include dialectal information) and en-TR (where dialectal information is removed from en-IN). We evaluate three multilingual LLMs{--}one open source (Llama3) and two closed-source (GPT-4/3.5). LLMs perform significantly better for US English than Indian English for both TWP and TWS tasks, for all settings, exhibiting marginalisation against the Indian dialect of English. While GPT-based models perform the best, the comparatively smaller models work more equitably after fine-tuning. Our evaluation methodology exhibits a novel and reproducible way to examine attributes of language models using pre-existing dialogue datasets with language varieties. Dialect being an artifact of one{'}s culture, this paper demonstrates the gap in the performance of multilingual LLMs for communities that do not use a mainstream dialect.}
}
Copied!

ACM Transactions on Intelligent Systems and Technology
PDF
Code
BibTeX
@article{10.1145/3768161,
  author     = {Nguyen, Duke and Yin, Du and Joshi, Aditya and Salim, Flora},
  title      = {Spectraformer: A Unified Random Feature Framework for Transformer},
  year       = {2026},
  issue_date = {June 2026},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  volume     = {17},
  number     = {3},
  issn       = {2157-6904},
  url        = {https://doi.org/10.1145/3768161},
  doi        = {10.1145/3768161},
  abstract   = {Linearization of attention using various kernel approximation and kernel learning techniques has shown promise. Past methods used a subset of combinations of component functions and weight matrices within the random feature paradigm. We identify the need for a systematic comparison of different combinations of weight matrices and component functions for attention learning in Transformer. Hence, we introduce Spectraformer, a unified framework for approximating and learning the kernel function in the attention mechanism of the Transformer. Our empirical results demonstrate, for the first time, that a random feature-based approach can achieve performance comparable to top-performing sparse and low-rank methods on the challenging Long-Range Arena benchmark. Thus, we establish a new state-of-the-art for random feature-based efficient Transformers. The framework also produces many variants that offer different advantages in accuracy, training time, and memory consumption. Our code is available at: .},
  journal    = {ACM Trans. Intell. Syst. Technol.},
  month      = mar,
  articleno  = {50},
  numpages   = {29},
  keywords   = {transformers, kernel, linearized attention, kernelized attention}
}
Copied!