Ding, Yiwei; Lerch, Alexander Audio Embeddings as Teachers for Music Classification Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Milan, Italy, 2023. Abstract | Links | BibTeX | Tags: Computer Science - Information Retrieval, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing Ma, Alison B; Lerch, Alexander Representation Learning for the Automatic Indexing of Sound Effects Libraries Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Bangalore, IN, 2022, (arXiv:2208.09096 [cs, eess]). Abstract | Links | BibTeX | Tags: Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing Vinay, Ashvala; Lerch, Alexander Evaluating Generative Audio Systems and their Metrics Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Bangalore, IN, 2022, (arXiv:2209.00130 [cs, eess]). Abstract | Links | BibTeX | Tags: Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing Kalbag, Vedant; Lerch, Alexander Scream Detection in Heavy Metal Music Proceedings Article In: Proceedings of the Sound and Music Computing Conference (SMC), Saint-Etienne, 2022. Abstract | Links | BibTeX | Tags: Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing Hung, Yun-Ning; Lerch, Alexander Feature-informed Embedding Space Regularization for Audio Classification Proceedings Article In: Proceedings of the European Signal Processing Conference (EUSIPCO), Belgrade, Serbia, 2022. 
Abstract | Links | BibTeX | Tags: Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing Watcharasupat, Karn N; Lerch, Alexander Evaluation of Latent Space Disentanglement in the Presence of Interdependent Attributes Proceedings Article In: Late Breaking Demo (Extended Abstract), Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Online, 2021. Abstract | Links | BibTeX | Tags: Computer Science - Information Retrieval, Computer Science - Information Theory, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing
2023
@comment{ISMIR 2023 conference paper; the linked full text is the arXiv preprint 2306.17424.}
@inproceedings{ding_audio_2023,
  author     = {Ding, Yiwei and Lerch, Alexander},
  title      = {Audio Embeddings as Teachers for Music Classification},
  booktitle  = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address    = {Milan, Italy},
  year       = {2023},
  doi        = {10.48550/arXiv.2306.17424},
  eprint     = {2306.17424},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2306.17424},
  urldate    = {2023-06-01},
  abstract   = {Music classification has been one of the most popular tasks in the field of music information retrieval. With the development of deep learning models, the last decade has seen impressive improvements in a wide range of classification tasks. However, the increasing model complexity makes both training and inference computationally expensive. In this paper, we integrate the ideas of transfer learning and feature-based knowledge distillation and systematically investigate using pre-trained audio embeddings as teachers to guide the training of low-complexity student networks. By regularizing the feature space of the student networks with the pre-trained embeddings, the knowledge in the teacher embeddings can be transferred to the students. We use various pre-trained audio embeddings and test the effectiveness of the method on the tasks of musical instrument classification and music auto-tagging. Results show that our method significantly improves the results in comparison to the identical model trained without the teacher's knowledge. This technique can also be combined with classical knowledge distillation approaches to further improve the model's performance.},
  keywords   = {Computer Science - Information Retrieval, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
2022
@comment{ISMIR 2022 conference paper; the linked full text is the arXiv preprint 2208.09096.}
@inproceedings{ma_representation_2022,
  author     = {Ma, Alison B. and Lerch, Alexander},
  title      = {Representation Learning for the Automatic Indexing of Sound Effects Libraries},
  booktitle  = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address    = {Bangalore, IN},
  year       = {2022},
  doi        = {10.48550/arXiv.2208.09096},
  eprint     = {2208.09096},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2208.09096},
  urldate    = {2022-08-22},
  abstract   = {Labeling and maintaining a commercial sound effects library is a time-consuming task exacerbated by databases that continually grow in size and undergo taxonomy updates. Moreover, sound search and taxonomy creation are complicated by non-uniform metadata, an unrelenting problem even with the introduction of a new industry standard, the Universal Category System. To address these problems and overcome dataset-dependent limitations that inhibit the successful training of deep learning models, we pursue representation learning to train generalized embeddings that can be used for a wide variety of sound effects libraries and are a taxonomy-agnostic representation of sound. We show that a task-specific but dataset-independent representation can successfully address data issues such as class imbalance, inconsistent class labels, and insufficient dataset size, outperforming established representations such as OpenL3. Detailed experimental results show the impact of metric learning approaches and different cross-dataset training methods on representational effectiveness.},
  keywords   = {Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
@comment{ISMIR 2022 conference paper; the linked full text is the arXiv preprint 2209.00130.}
@inproceedings{vinay_evaluating_2022,
  author     = {Vinay, Ashvala and Lerch, Alexander},
  title      = {Evaluating Generative Audio Systems and their Metrics},
  booktitle  = {Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address    = {Bangalore, IN},
  year       = {2022},
  doi        = {10.48550/arXiv.2209.00130},
  eprint     = {2209.00130},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2209.00130},
  urldate    = {2022-09-03},
  abstract   = {Recent years have seen considerable advances in audio synthesis with deep generative models. However, the state-of-the-art is very difficult to quantify; different studies often use different evaluation methodologies and different metrics when reporting results, making a direct comparison to other systems difficult if not impossible. Furthermore, the perceptual relevance and meaning of the reported metrics in most cases unknown, prohibiting any conclusive insights with respect to practical usability and audio quality. This paper presents a study that investigates state-of-the-art approaches side-by-side with (i) a set of previously proposed objective metrics for audio reconstruction, and with (ii) a listening study. The results indicate that currently used objective metrics are insufficient to describe the perceptual quality of current systems.},
  keywords   = {Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
@comment{SMC 2022 conference paper; the linked full text is the arXiv preprint 2205.05580.}
@inproceedings{kalbag_scream_2022,
  author     = {Kalbag, Vedant and Lerch, Alexander},
  title      = {Scream Detection in Heavy Metal Music},
  booktitle  = {Proceedings of the Sound and Music Computing Conference ({SMC})},
  address    = {Saint-Etienne},
  year       = {2022},
  doi        = {10.48550/arXiv.2205.05580},
  eprint     = {2205.05580},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2205.05580},
  abstract   = {Harsh vocal effects such as screams or growls are far more common in heavy metal vocals than the traditionally sung vocal. This paper explores the problem of detection and classification of extreme vocal techniques in heavy metal music, specifically the identification of different scream techniques. We investigate the suitability of various feature representations, including cepstral, spectral, and temporal features as input representations for classification. The main contributions of this work are (i) a manually annotated dataset comprised of over 280 minutes of heavy metal songs of various genres with a statistical analysis of occurrences of different extreme vocal techniques in heavy metal music, and (ii) a systematic study of different input feature representations for the classification of heavy metal vocals.},
  keywords   = {Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
@comment{EUSIPCO 2022 conference paper; the linked full text is the arXiv preprint 2206.04850.}
@inproceedings{hung_feature-informed_2022,
  author     = {Hung, Yun-Ning and Lerch, Alexander},
  title      = {Feature-informed Embedding Space Regularization for Audio Classification},
  booktitle  = {Proceedings of the European Signal Processing Conference ({EUSIPCO})},
  address    = {Belgrade, Serbia},
  year       = {2022},
  doi        = {10.48550/arXiv.2206.04850},
  eprint     = {2206.04850},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2206.04850},
  abstract   = {Feature representations derived from models pre-trained on large-scale datasets have shown their generalizability on a variety of audio analysis tasks. Despite this generalizability, however, task-specific features can outperform if sufficient training data is available, as specific task-relevant properties can be learned. Furthermore, the complex pre-trained models bring considerable computational burdens during inference. We propose to leverage both detailed task-specific features from spectrogram input and generic pre-trained features by introducing two regularization methods that integrate the information of both feature classes. The workload is kept low during inference as the pre-trained features are only necessary for training. In experiments with the pre-trained features VGGish, OpenL3, and a combination of both, we show that the proposed methods not only outperform baseline methods, but also can improve state-of-the-art models on several audio classification tasks. The results also suggest that using the mixture of features performs better than using individual features.},
  keywords   = {Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
2021
@comment{ISMIR 2021 late-breaking demo (extended abstract); the linked full text is the arXiv preprint 2110.05587.}
@inproceedings{watcharasupat_evaluation_2021,
  author     = {Watcharasupat, Karn N. and Lerch, Alexander},
  title      = {Evaluation of Latent Space Disentanglement in the Presence of Interdependent Attributes},
  booktitle  = {Late Breaking Demo (Extended Abstract), Proceedings of the International Society for Music Information Retrieval Conference ({ISMIR})},
  address    = {Online},
  year       = {2021},
  doi        = {10.48550/arXiv.2110.05587},
  eprint     = {2110.05587},
  eprinttype = {arXiv},
  url        = {https://arxiv.org/abs/2110.05587},
  urldate    = {2021-11-11},
  abstract   = {Controllable music generation with deep generative models has become increasingly reliant on disentanglement learning techniques. However, current disentanglement metrics, such as mutual information gap (MIG), are often inadequate and misleading when used for evaluating latent representations in the presence of interdependent semantic attributes often encountered in real-world music datasets. In this work, we propose a dependency-aware information metric as a drop-in replacement for MIG that accounts for the inherent relationship between semantic attributes.},
  keywords   = {Computer Science - Information Retrieval, Computer Science - Information Theory, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
  pubstate   = {published},
  tppubtype  = {inproceedings}
}
publications
Audio Embeddings as Teachers for Music Classification Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Milan, Italy, 2023. Representation Learning for the Automatic Indexing of Sound Effects Libraries Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Bangalore, IN, 2022, (arXiv:2208.09096 [cs, eess]). Evaluating Generative Audio Systems and their Metrics Proceedings Article In: Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Bangalore, IN, 2022, (arXiv:2209.00130 [cs, eess]). Scream Detection in Heavy Metal Music Proceedings Article In: Proceedings of the Sound and Music Computing Conference (SMC), Saint-Etienne, 2022. Feature-informed Embedding Space Regularization for Audio Classification Proceedings Article In: Proceedings of the European Signal Processing Conference (EUSIPCO), Belgrade, Serbia, 2022. Evaluation of Latent Space Disentanglement in the Presence of Interdependent Attributes Proceedings Article In: Late Breaking Demo (Extended Abstract), Proceedings of the International Society for Music Information Retrieval Conference (ISMIR), Online, 2021.
2023
2022
2021