% Workshop paper from SeWebMeDa-2023 (CEUR-WS Vol-3466, paper 5): a comparison
% of vector-based document-similarity approaches on the RELISH corpus.
% The acronym in `title` is brace-protected so sentence-casing styles keep it
% upper-case. The `\#` in `url` is kept escaped, as in the original record.
% NOTE(review): `pages = {10}` looks like a total page count rather than a
% page range -- confirm against the published proceedings.
@inproceedings{RavinderFellerhofDadietal.2023,
  author    = {Ravinder, Rohitha and Fellerhof, Tim and Dadi, Vishnu and Geist, Lukas and Rocamora, Guillermo and Talha, Muhammad and Rebholz-Schuhmann, Dietrich and Castro, Leyla Jael},
  title     = {A Comparison of Vector-based Approaches for Document Similarity Using the {RELISH} Corpus},
  booktitle = {Proceedings {SeWebMeDa-2023}: 6th International Workshop on {Semantic Web} solutions for large-scale biomedical data analytics, May 29, 2023, Hersonissos, Greece},
  pages     = {10},
  year      = {2023},
  url       = {https://ceur-ws.org/Vol-3466\#paper5},
  language  = {en},
  abstract  = {The continuously increasing number of biomedical scholarly publications makes it challenging to construct document recommendation algorithms that can efficiently navigate through literature. Such algorithms would help researchers in finding similar, relevant, and related publications that align with their research interests. Natural Language Processing offers various alternatives to compare publications, ranging from entity recognition to document embeddings. In this paper, we present the results of a comparative analysis of vector-based approaches to assess document similarity in the RELISH corpus. We aim to determine the best approach that resembles relevance without the need for further training. Specifically, we employ five different techniques to generate vectors representing the text in the documents. These techniques employ a combination of various Natural Language Processing frameworks such as Word2Vec, Doc2Vec, dictionary-based Named Entity Recognition, and state-of-the-art models based on BERT. To evaluate the document similarity obtained by these approaches, we utilize different evaluation metrics that account for relevance judgment, relevance search, and re-ranking of the relevance search.
Our results demonstrate that the most promising approach is an in-house version of document embeddings, starting with word embeddings and using centroids to aggregate them by document.},
}