@comment{Key changed from MetzlerPl{\"o}gerHees2024: braces/backslashes are not
  portable in BibTeX citation keys. Update any \cite{} that used the old key.
  NOTE(review): `institution` is non-standard for @inproceedings and is ignored
  by standard styles; kept as an annotation. Editors are embedded in booktitle
  ("Klein, Krupka et al. (Hg.)") -- full editor names not recoverable from this
  entry alone; consider a proper editor field once confirmed.}
@inproceedings{MetzlerPloegerHees2024,
  author      = {Metzler, Tim and Pl{\"o}ger, Paul G. and Hees, J{\"o}rn},
  title       = {Computer-Assisted Short Answer Grading Using Large Language Models and Rubrics},
  booktitle   = {Klein, Krupka et al. (Hg.): Informatik 2024. Tagung vom 24.-26. September 2024, Wiesbaden. Lock-in or log out? - Wie digitale Souver{\"a}nit{\"a}t gelingt},
  year        = {2024},
  pages       = {1383--1393},
  doi         = {10.18420/inf2024_121},
  isbn        = {978-3-88579-746-3},
  institution = {Fachbereich Informatik},
  abstract    = {Grading student answers and providing feedback are essential yet time-consuming tasks for educators. Recent advancements in Large Language Models (LLMs), including ChatGPT, Llama, and Mistral, have paved the way for automated support in this domain. This paper investigates the efficacy of instruction-following LLMs in adhering to predefined rubrics for evaluating student answers and delivering meaningful feedback. Leveraging the Mohler dataset and a custom German dataset, we evaluate various models, from commercial ones like ChatGPT to smaller open-source options like Llama, Mistral, and Command R. Additionally, we explore the impact of temperature parameters and techniques such as few-shot prompting. Surprisingly, while few-shot prompting enhances grading accuracy closer to ground truth, it introduces model inconsistency. Furthermore, some models exhibit non-deterministic behavior even at near-zero temperature settings. Our findings highlight the importance of rubrics in enhancing the interpretability of model outputs and fostering consistency in grading practices.},
  language    = {en},
}