@inproceedings{liu_enhancing_2024,
title = {Enhancing Video Music Recommendation with Transformer-Driven Audio-Visual Embeddings},
author = {Shimiao Liu and Alexander Lerch},
url = {https://ieeexplore.ieee.org/abstract/document/10704086},
doi = {10.1109/IS262782.2024.10704086},
year = {2024},
date = {2024-01-01},
booktitle = {Proceedings of the IEEE International Symposium on the Internet of Sounds (IS2)},
address = {Erlangen},
abstract = {A fitting soundtrack can help a video better convey its content and provide a more immersive experience. This paper introduces a novel approach utilizing self-supervised learning and contrastive learning to automatically recommend audio for video content, thereby eliminating the need for manual labeling. We use a dual-branch cross-modal embedding model that maps both audio and video features into a common low-dimensional space. The fit of various audio-video pairs can then be modeled as an inverse distance measure. In addition, a comparative analysis of various temporal encoding methods is presented, emphasizing the effectiveness of transformers in managing the temporal information of audio-video matching tasks. Through multiple experiments, we demonstrate that our model TIVM, which integrates transformer encoders and an InfoNCE loss, significantly improves the performance of audio-video matching and surpasses traditional methods.},
keywords = {Contrastive learning, Encoding, Fitting, Immersive experience, Internet, Labeling, Manuals, multi-modal, music, music recommendation, Recommender systems, transformer, Transformers},
pubstate = {published},
tppubtype = {inproceedings}
}
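The entry above describes a dual-branch model trained with a contrastive InfoNCE objective over paired audio and video embeddings. As a rough illustration of that objective (a minimal sketch, not the authors' TIVM code; the embedding size, temperature, and all names are assumptions), the symmetric InfoNCE loss over a batch of matched pairs looks like this:

# Minimal sketch (not the authors' code): symmetric InfoNCE over a batch of
# paired audio/video embeddings for cross-modal matching. The projection
# dimensionality and the temperature value are illustrative assumptions.
import torch
import torch.nn.functional as F

def info_nce(audio_emb: torch.Tensor, video_emb: torch.Tensor,
             temperature: float = 0.07) -> torch.Tensor:
    """audio_emb, video_emb: (batch, dim); row i of each is a matched pair."""
    a = F.normalize(audio_emb, dim=-1)   # unit-length embeddings, so the
    v = F.normalize(video_emb, dim=-1)   # dot product is cosine similarity
    logits = a @ v.t() / temperature     # (batch, batch) similarity matrix
    targets = torch.arange(a.size(0), device=a.device)
    # Matched pairs lie on the diagonal; every other clip in the batch is a negative.
    loss_a2v = F.cross_entropy(logits, targets)
    loss_v2a = F.cross_entropy(logits.t(), targets)
    return 0.5 * (loss_a2v + loss_v2a)

audio = torch.randn(8, 128)
video = torch.randn(8, 128)
print(info_nce(audio, video))

At recommendation time, candidate soundtracks can then be ranked by cosine similarity to the video embedding, which is the "inverse distance" fit the abstract refers to.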
@inproceedings{wang_catch_2022,
title = {To Catch A Chorus, Verse, Intro, or Anything Else: Analyzing a Song with Structural Functions},
author = {Ju-Chiang Wang and Yun-Ning Hung and Jordan B. L. Smith},
url = {https://ieeexplore.ieee.org/abstract/document/9747252/authors#authors},
doi = {10.1109/ICASSP43922.2022.9747252},
year = {2022},
date = {2022-05-01},
urldate = {2024-02-08},
booktitle = {ICASSP 2022 - 2022 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {416--420},
abstract = {Conventional music structure analysis algorithms aim to divide a song into segments and to group them with abstract labels (e.g., ‘A’, ‘B’, and ‘C’). However, explicitly identifying the function of each segment (e.g., ‘verse’ or ‘chorus’) is rarely attempted, but has many applications. We introduce a multi-task deep learning framework to model these structural semantic labels directly from audio by estimating "verseness," "chorusness," and so forth, as a function of time. We propose a 7-class taxonomy (i.e., intro, verse, chorus, bridge, outro, instrumental, and silence) and provide rules to consolidate annotations from four disparate datasets. We also propose to use a spectral-temporal Transformer-based model, called SpecTNT, which can be trained with an additional connectionist temporal localization (CTL) loss. In cross-dataset evaluations using four public datasets, we demonstrate the effectiveness of the SpecTNT model and CTL loss, and obtain strong results overall: the proposed system outperforms state-of-the-art chorus-detection and boundary-detection methods at detecting choruses and boundaries, respectively.},
note = {ISSN: 2379-190X},
keywords = {Location awareness, music, Music structure, segmentation, semantic labeling, Semantics, Signal processing, Signal processing algorithms, SpecTNT, Taxonomy, Transformer, Transformers},
pubstate = {published},
tppubtype = {inproceedings}
}
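The paper above estimates framewise activation curves ("verseness," "chorusness," and so on) over a 7-class taxonomy. A minimal post-processing sketch of that output representation follows (this is not SpecTNT or the paper's CTL training; the frame rate and the argmax decoding rule are assumptions):

# Minimal sketch: turn framewise class probabilities over the paper's 7-class
# taxonomy into labeled segments by taking the argmax per frame and placing a
# boundary wherever the winning label changes.
import numpy as np

CLASSES = ["intro", "verse", "chorus", "bridge", "outro", "instrumental", "silence"]
HOP_SEC = 0.5  # assumed frame rate of the activation curves

def segments_from_activations(probs: np.ndarray):
    """probs: (n_frames, 7) framewise class probabilities."""
    labels = probs.argmax(axis=1)
    boundaries = np.flatnonzero(np.diff(labels)) + 1   # frames where the label changes
    starts = np.concatenate(([0], boundaries))
    ends = np.concatenate((boundaries, [len(labels)]))
    return [(s * HOP_SEC, e * HOP_SEC, CLASSES[labels[s]]) for s, e in zip(starts, ends)]

# Toy activations: 4 frames of "intro", then 6 frames of "chorus".
toy = np.zeros((10, 7))
toy[:4, 0] = 1.0
toy[4:, 2] = 1.0
print(segments_from_activations(toy))  # [(0.0, 2.0, 'intro'), (2.0, 5.0, 'chorus')]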
@inproceedings{hung_transcription_2021,
title = {Transcription Is All You Need: Learning To Separate Musical Mixtures With Score As Supervision},
author = {Yun-Ning Hung and Gordon Wichern and Jonathan Le Roux},
url = {https://ieeexplore.ieee.org/abstract/document/9413358/authors#authors},
doi = {10.1109/ICASSP39728.2021.9413358},
year = {2021},
date = {2021-06-01},
urldate = {2024-02-08},
booktitle = {ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)},
pages = {46--50},
abstract = {Most music source separation systems require large collections of isolated sources for training, which can be difficult to obtain. In this work, we use musical scores, which are comparatively easy to obtain, as a weak label for training a source separation system. In contrast with previous score-informed separation approaches, our system does not require isolated sources, and the score is used only as a training target; it is not required for inference. Our model consists of a separator that outputs a time-frequency mask for each instrument, and a transcriptor that acts as a critic, providing both temporal and frequency supervision to guide the learning of the separator. A harmonic mask constraint is introduced as another way of leveraging score information during training, and we propose two novel adversarial losses for additional fine-tuning of both the transcriptor and the separator. Results demonstrate that using score information outperforms temporal weak labels, and that adversarial structures lead to further improvements in both separation and transcription performance.},
note = {ISSN: 2379-190X},
keywords = {audio source separation, Conferences, Instruments, music, music transcription, Particle separators, Source separation, Time-frequency analysis, Training, weakly-labeled data, weakly-supervised separation},
pubstate = {published},
tppubtype = {inproceedings}
}
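The system above pairs a masking separator with a transcriptor that acts as a critic, supervised only by score-derived labels. A minimal sketch of that weak-supervision loop follows (toy networks and shapes of my own choosing; the paper's actual architectures, harmonic mask constraint, and adversarial losses are not reproduced here):

# Minimal sketch: a separator predicts one time-frequency mask per instrument;
# a transcriptor scores each separated estimate, and its output is trained
# against a score-derived piano-roll, so no isolated sources are needed.
import torch
import torch.nn as nn

N_INST, N_BINS, N_PITCH = 2, 513, 88

separator = nn.Sequential(nn.Linear(N_BINS, 256), nn.ReLU(),
                          nn.Linear(256, N_INST * N_BINS))
transcriptor = nn.Sequential(nn.Linear(N_BINS, 256), nn.ReLU(),
                             nn.Linear(256, N_PITCH))

mix = torch.rand(1, 100, N_BINS)                     # |STFT| of the mixture
masks = separator(mix).view(1, 100, N_INST, N_BINS).softmax(dim=2)
estimates = masks * mix.unsqueeze(2)                 # per-instrument magnitudes

score_roll = torch.randint(0, 2, (1, 100, N_INST, N_PITCH)).float()  # weak labels
pred_roll = transcriptor(estimates)                  # (1, 100, N_INST, N_PITCH)
loss = nn.functional.binary_cross_entropy_with_logits(pred_roll, score_roll)
loss.backward()                                      # gradients reach the separator

Because the transcription loss backpropagates through the separated estimates, the separator is pushed to route each instrument's energy into its own mask even though no isolated stems are ever observed.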
@book{lerch_introduction_2012,
title = {An Introduction to Audio Content Analysis: Applications in Signal Processing and Music Informatics},
author = {Alexander Lerch},
url = {http://ieeexplore.ieee.org/xpl/bkabstractplus.jsp?bkn=6266785},
isbn = {978-1-118-26682-3},
year = {2012},
date = {2012-01-01},
publisher = {Wiley-IEEE Press},
address = {Hoboken},
abstract = {With the proliferation of digital audio distribution over digital media, audio content analysis is fast becoming a requirement for designers of intelligent signal-adaptive audio processing systems. Written by a well-known expert in the field, this book provides quick access to different analysis algorithms and allows comparison between different approaches to the same task, making it useful for newcomers to audio signal processing and industry experts alike. A review of relevant fundamentals in audio signal processing, psychoacoustics, and music theory, as well as downloadable MATLAB files are also included. Please visit the companion website: www.AudioContentAnalysis.org},
keywords = {analysis, audio, audio signal processing, information, listening, machine, machine listening, music, music analysis, music information retrieval, processing, retrieval, signal},
pubstate = {published},
tppubtype = {book}
}
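Among many other topics, the book surveys low-level instantaneous features extracted per STFT frame. For flavor, here is a minimal NumPy sketch of one classic example, the spectral centroid (an analogue of the kind of MATLAB code on the companion site, not taken from the book itself):

# Minimal sketch: the spectral centroid, i.e., the magnitude-weighted mean
# frequency of each STFT frame, for a mono signal x at sample rate fs.
import numpy as np

def spectral_centroid(x: np.ndarray, fs: float, n_fft: int = 2048, hop: int = 512):
    win = np.hanning(n_fft)
    n_frames = 1 + (len(x) - n_fft) // hop
    freqs = np.fft.rfftfreq(n_fft, d=1.0 / fs)
    sc = np.empty(n_frames)
    for i in range(n_frames):
        mag = np.abs(np.fft.rfft(win * x[i * hop:i * hop + n_fft]))
        sc[i] = (freqs * mag).sum() / max(mag.sum(), 1e-12)
    return sc

fs = 44100
t = np.arange(fs) / fs
x = np.sin(2 * np.pi * 440 * t)          # 1 s sine at 440 Hz
print(spectral_centroid(x, fs).mean())   # close to 440 for a pure tone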
@book{lerch_software-based_2009,
title = {Software-Based Extraction of Objective Parameters from Music Performances},
author = {Alexander Lerch},
url = {http://dx.doi.org/10.14279/depositonce-2025},
isbn = {978-3-640-29496-1},
year = {2009},
date = {2009-01-01},
publisher = {GRIN Verlag},
address = {M\"{u}nchen},
abstract = {Different music performances of the same score may differ significantly from each other. Evidently, the listener’s music experience is defined not only by the composer’s work, the score, but also by the music performance itself, which is an integral part of this experience. Music performers use the information contained in the score, but interpret, transform, or add to this information. Four parameter classes can be used to describe a performance objectively: tempo and timing, loudness, timbre, and pitch. Each class contains a multitude of individual parameters that are at the performers’ disposal to generate a unique physical rendition of musical ideas. The extraction of such objective parameters is one of the difficulties in music performance research. This work presents an approach to the software-based extraction of tempo and timing, loudness, and timbre parameters from audio files, providing a tool for the automatic parameter extraction from music performances. The system is applied to extract data from 21 string quartet performances, and a detailed analysis of the extracted data is presented. The main contributions of this thesis are the adaptation and development of signal processing approaches to performance parameter extraction and the presentation and discussion of string quartet performances of a movement of Beethoven’s late String Quartet op. 130.},
keywords = {analysis, audio, content, information, music, performance, retrieval},
pubstate = {published},
tppubtype = {book}
}
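Of the four parameter classes the thesis names, tempo and timing is the most direct to illustrate: once beat times have been extracted from a recording, a local tempo curve follows from the inter-beat intervals. A trivial sketch (illustrative only, not the thesis software; the beat times are made up):

# Minimal sketch: local tempo from extracted beat times, tempo_bpm = 60 / IBI.
import numpy as np

beat_times = np.array([0.00, 0.52, 1.01, 1.55, 2.04])  # seconds, e.g. from beat tracking
ibi = np.diff(beat_times)          # inter-beat intervals
tempo_bpm = 60.0 / ibi             # local tempo between consecutive beats
print(tempo_bpm)                   # timing deviations show up as tempo fluctuation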