2024
@inproceedings{liu_enhancing_2024,
title = {Enhancing Video Music Recommendation with Transformer-Driven Audio-Visual Embeddings},
author = {Shimiao Liu and Alexander Lerch},
url = {https://ieeexplore.ieee.org/abstract/document/10704086},
doi = {10.1109/IS262782.2024.10704086},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the IEEE International Symposium on the Internet of Sounds (IS2)},
address = {Erlangen},
abstract = {A fitting soundtrack can help a video better convey its content and provide a more immersive experience. This paper introduces a novel approach utilizing self-supervised learning and contrastive learning to automatically recommend audio for video content, thereby eliminating the need for manual labeling. We use a dual-branch cross-modal embedding model that maps both audio and video features into a common low-dimensional space. The fit of various audio-video pairs can then be modeled as an inverse distance measure. In addition, a comparative analysis of various temporal encoding methods is presented, emphasizing the effectiveness of transformers in managing the temporal information of audio-video matching tasks. Through multiple experiments, we demonstrate that our model TIVM, which integrates transformer encoders and uses InfoNCE loss, significantly improves the performance of audio-video matching and surpasses traditional methods.},
keywords = {Contrastive learning, Encoding, Fitting, Immersive experience, Internet, Labeling, Manuals, multi-modal, music, music recommendation, Recommender systems, transformer, Transformers},
pubstate = {published},
tppubtype = {inproceedings}
}
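The abstract above outlines the core recipe: a dual-branch model maps audio and video into a shared low-dimensional space, and an InfoNCE contrastive loss pulls matched pairs together while pushing mismatched in-batch pairs apart. As a rough illustration of that recipe only (not the paper's TIVM implementation; all dimensions, layer counts, and names below are placeholders), a PyTorch sketch might look like:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class DualBranchEmbedder(nn.Module):
    """Dual-branch cross-modal model: projects audio and video feature
    sequences into one shared low-dimensional space. All dimensions and
    layer counts are illustrative placeholders, not the TIVM settings."""
    def __init__(self, audio_dim=128, video_dim=512, embed_dim=64):
        super().__init__()
        self.audio_proj = nn.Linear(audio_dim, embed_dim)
        self.video_proj = nn.Linear(video_dim, embed_dim)
        layer = nn.TransformerEncoderLayer(d_model=embed_dim, nhead=4, batch_first=True)
        # nn.TransformerEncoder deep-copies the layer, so each branch gets its own weights.
        self.audio_enc = nn.TransformerEncoder(layer, num_layers=2)
        self.video_enc = nn.TransformerEncoder(layer, num_layers=2)

    def forward(self, audio, video):
        # audio: (batch, frames, audio_dim); video: (batch, frames, video_dim)
        a = self.audio_enc(self.audio_proj(audio)).mean(dim=1)   # temporal pooling
        v = self.video_enc(self.video_proj(video)).mean(dim=1)
        return F.normalize(a, dim=-1), F.normalize(v, dim=-1)

def info_nce(audio_emb, video_emb, temperature=0.07):
    """Symmetric InfoNCE: the matched audio-video pair in each batch row is
    the positive; every other pairing in the batch acts as a negative."""
    logits = audio_emb @ video_emb.t() / temperature             # (batch, batch)
    targets = torch.arange(logits.size(0), device=logits.device)
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```

At recommendation time, the soundtrack whose embedding lies closest to a video's embedding would score highest; with unit-normalized embeddings, cosine similarity and Euclidean distance produce the same ranking, which matches the inverse-distance view of fit described in the abstract.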
2022
@inproceedings{wang_catch_2022,
title = {To Catch A Chorus, Verse, Intro, or Anything Else: Analyzing a Song with Structural Functions},
author = {Ju-Chiang Wang and Yun-Ning Hung and Jordan B. L. Smith},
url = {https://ieeexplore.ieee.org/abstract/document/9747252/authors#authors},
doi = {10.1109/ICASSP43922.2022.9747252},
year = {2022},
date = {2022-05-01},
urldate = {2024-02-08},
booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {416--420},
abstract = {Conventional music structure analysis algorithms aim to divide a song into segments and to group them with abstract labels (e.g., ‘A’, ‘B’, and ‘C’). However, explicitly identifying the function of each segment (e.g., ‘verse’ or ‘chorus’) is rarely attempted, but has many applications. We introduce a multi-task deep learning framework to model these structural semantic labels directly from audio by estimating "verseness," "chorusness," and so forth, as a function of time. We propose a 7-class taxonomy (i.e., intro, verse, chorus, bridge, outro, instrumental, and silence) and provide rules to consolidate annotations from four disparate datasets. We also propose to use a spectral-temporal Transformer-based model, called SpecTNT, which can be trained with an additional connectionist temporal localization (CTL) loss. In cross-dataset evaluations using four public datasets, we demonstrate the effectiveness of the SpecTNT model and CTL loss, and obtain strong results overall: the proposed system outperforms state-of-the-art chorus-detection and boundary-detection methods at detecting choruses and boundaries, respectively.},
note = {ISSN: 2379-190X},
keywords = {Location awareness, music, Music structure, segmentation, semantic labeling, Semantics, Signal processing, Signal processing algorithms, SpecTNT, Taxonomy, Transformer, Transformers},
pubstate = {published},
tppubtype = {inproceedings}
}
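The abstract frames structure analysis as estimating "verseness," "chorusness," and so forth as functions of time over a fixed 7-class taxonomy. A minimal frame-wise formulation of that idea (using a generic transformer encoder as a stand-in for SpecTNT, and plain cross-entropy in place of the paper's CTL loss; all sizes and names are placeholders) could be:

```python
import torch
import torch.nn as nn

# The paper's 7-class structural taxonomy.
CLASSES = ["intro", "verse", "chorus", "bridge", "outro", "instrumental", "silence"]

class FramewiseStructureTagger(nn.Module):
    """Stand-in for SpecTNT: any sequence encoder that emits a per-frame
    score for each structural function, i.e. "chorusness" over time.
    The encoder here is a generic placeholder, not SpecTNT, and the
    connectionist temporal localization (CTL) loss is not reproduced."""
    def __init__(self, n_mels=80, d_model=128):
        super().__init__()
        self.frontend = nn.Linear(n_mels, d_model)
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=4, batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=4)
        self.head = nn.Linear(d_model, len(CLASSES))

    def forward(self, mel):                       # mel: (batch, frames, n_mels)
        h = self.encoder(self.frontend(mel))
        return self.head(h)                       # (batch, frames, 7) logits per frame

# Training on frame-level labels consolidated to the 7-class taxonomy:
model = FramewiseStructureTagger()
mel = torch.randn(2, 500, 80)                     # dummy batch: 2 clips, 500 frames
labels = torch.randint(0, len(CLASSES), (2, 500))
logits = model(mel)
loss = nn.functional.cross_entropy(logits.reshape(-1, len(CLASSES)), labels.reshape(-1))
```

Taking a per-frame argmax over the 7 logits then yields both the segmentation (boundaries where the winning class changes) and the semantic label of each segment, which is the dual output the paper targets.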