Liu, Shimiao; Lerch, Alexander: Enhancing Video Music Recommendation with Transformer-Driven Audio-Visual Embeddings. Proceedings Article. In: Proceedings of the IEEE International Symposium on the Internet of Sounds (IS2), Erlangen, 2024. Abstract | Links | BibTeX | Tags: Contrastive learning, Encoding, Fitting, Immersive experience, Internet, Labeling, Manuals, multi-modal, music, music recommendation, Recommender systems, trans-former, Transformers | 2024
@comment{liu_enhancing_2024: conference paper. Only the year field carries the publication date; the exporter's extra date field (2024-01-01) was dropped because its month and day are not backed by anything else in this record -- presumably exporter placeholders, confirm against the proceedings before re-adding a full date.}
@inproceedings{liu_enhancing_2024,
  author    = {Liu, Shimiao and Lerch, Alexander},
  title     = {Enhancing Video Music Recommendation with Transformer-Driven Audio-Visual Embeddings},
  booktitle = {Proceedings of the {IEEE} International Symposium on the Internet of Sounds ({IS2})},
  address   = {Erlangen},
  year      = {2024},
  doi       = {10.1109/IS262782.2024.10704086},
  url       = {https://ieeexplore.ieee.org/abstract/document/10704086},
  abstract  = {A fitting soundtrack can help a video better convey its content and provide a better immersive experience. This paper introduces a novel approach utilizing self-supervised learning and contrastive learning to automatically recommend audio for video content, thereby eliminating the need for manual labeling. We use a dual-branch cross-modal embedding model that maps both audio and video features into a common low-dimensional space. The fit of various audio-video pairs can then be modeled as inverse distance measure. In addition, a comparative analysis of various temporal encoding methods is presented, emphasizing the effectiveness of transformers in managing the temporal information of audio-video matching tasks. Through multiple experiments, we demonstrate that our model TIVM, which integrates transformer encoders and using InfoNCE loss, significantly improves the performance of audio-video matching and surpasses traditional methods.},
  keywords  = {Contrastive learning, Encoding, Fitting, Immersive experience, Internet, Labeling, Manuals, multi-modal, music, music recommendation, Recommender systems, trans-former, Transformers},
  pubstate  = {published},
  tppubtype = {inproceedings},
}
publications
Enhancing Video Music Recommendation with Transformer-Driven Audio-Visual Embeddings. Proceedings Article. In: Proceedings of the IEEE International Symposium on the Internet of Sounds (IS2), Erlangen, 2024. | 2024