2025
@inproceedings{kim_pianovam_2025,
title = {PianoVAM: A Multimodal Piano Performance Dataset},
author = {Yonghyun Kim and Junhyung Park and Joonhyung Bae and Kirak Kim and Taegyun Kwon and Alexander Lerch and Juhan Nam},
url = {http://arxiv.org/abs/2509.08800},
doi = {10.48550/arXiv.2509.08800},
year = {2025},
date = {2025-09-01},
urldate = {2025-09-19},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
address = {Daejeon, South Korea},
abstract = {The multimodal nature of music performance has driven increasing interest in data beyond the audio domain within the music information retrieval (MIR) community. This paper introduces PianoVAM, a comprehensive piano performance dataset that includes videos, audio, MIDI, hand landmarks, fingering labels, and rich metadata. The dataset was recorded using a Disklavier piano, capturing audio and MIDI from amateur pianists during their daily practice sessions, alongside synchronized top-view videos in realistic and varied performance conditions. Hand landmarks and fingering labels were extracted using a pretrained hand pose estimation model and a semi-automated fingering annotation algorithm. We discuss the challenges encountered during data collection and the alignment process across different modalities. Additionally, we describe our fingering annotation method based on hand landmarks extracted from videos. Finally, we present benchmarking results for both audio-only and audio-visual piano transcription using the PianoVAM dataset and discuss additional potential applications.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Multimedia, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
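A minimal sketch, in Python, of the kind of per-frame hand-landmark extraction the PianoVAM abstract above describes. The paper does not name its pretrained hand pose model, so MediaPipe Hands and the video file name here are purely illustrative assumptions, not the dataset's actual pipeline.

```python
# Sketch: per-frame hand-landmark extraction from a top-view performance video.
# Assumptions: MediaPipe Hands stands in for the unspecified pretrained hand pose
# model; "performance.mp4" is a hypothetical file name.
import cv2
import mediapipe as mp

hands = mp.solutions.hands.Hands(static_image_mode=False, max_num_hands=2)
cap = cv2.VideoCapture("performance.mp4")

landmarks_per_frame = []
while True:
    ok, frame_bgr = cap.read()
    if not ok:
        break
    # MediaPipe expects RGB input.
    result = hands.process(cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB))
    frame_landmarks = []
    for hand in result.multi_hand_landmarks or []:
        # 21 (x, y, z) landmarks per detected hand, normalized to image size.
        frame_landmarks.append([(p.x, p.y, p.z) for p in hand.landmark])
    landmarks_per_frame.append(frame_landmarks)

cap.release()
hands.close()
```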
@inproceedings{kim_audio-based_2025,
title = {Audio-Based Pedestrian Detection in the Presence of Vehicular Noise},
author = {Yonghyun Kim and Chaeyeon Han and Akash Sarode and Noah Posner and Subhrajit Guhathakurta and Alexander Lerch},
url = {http://arxiv.org/abs/2509.19295},
doi = {10.48550/arXiv.2509.19295},
year = {2025},
date = {2025-09-01},
urldate = {2025-09-24},
booktitle = {Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE)},
address = {Barcelona, Spain},
abstract = {Audio-based pedestrian detection is a challenging task and has, thus far, only been explored in noise-limited environments. We present a new dataset, results, and a detailed analysis of the state-of-the-art in audio-based pedestrian detection in the presence of vehicular noise. In our study, we conduct three analyses: (i) cross-dataset evaluation between noisy and noise-limited environments, (ii) an assessment of the impact of noisy data on model performance, highlighting the influence of acoustic context, and (iii) an evaluation of the model's predictive robustness on out-of-domain sounds. The new dataset is a comprehensive 1321-hour roadside dataset incorporating traffic-rich soundscapes; each recording includes 16 kHz audio synchronized with frame-level pedestrian annotations and 1 fps video thumbnails.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
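A minimal sketch of pairing the dataset's 16 kHz audio with its 1 fps frame-level pedestrian annotations, as characterized in the abstract above, by slicing one-second windows per labeled frame. The file name and label values are hypothetical; the actual dataset format is defined by the paper, not here.

```python
# Sketch: cut one-second 16 kHz windows aligned to 1 fps pedestrian labels.
# Assumptions: "recording.wav" and the labels list are hypothetical placeholders.
import numpy as np
import soundfile as sf

SAMPLE_RATE = 16_000  # audio is 16 kHz per the abstract

audio, sr = sf.read("recording.wav")
assert sr == SAMPLE_RATE

# One annotation per second of video (e.g., a pedestrian presence flag or count).
labels = [0, 1, 1, 0, 2]  # hypothetical frame-level annotations at 1 fps

windows, targets = [], []
for second, label in enumerate(labels):
    start = second * SAMPLE_RATE
    clip = audio[start:start + SAMPLE_RATE]
    if len(clip) < SAMPLE_RATE:
        break  # drop a trailing partial window
    windows.append(clip)
    targets.append(label)

windows = np.stack(windows)   # shape: (num_seconds, 16000)
targets = np.asarray(targets)
```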
@inproceedings{watcharasupat_uncertainty_2025,
title = {Uncertainty Estimation in the Real World: A Study on Music Emotion Recognition},
author = {Karn N. Watcharasupat and Yiwei Ding and T. Aleksandra Ma and Pavan Seshadri and Alexander Lerch},
url = {http://arxiv.org/abs/2501.11570},
doi = {10.48550/arXiv.2501.11570},
year = {2025},
date = {2025-01-01},
urldate = {2025-01-30},
booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR)},
publisher = {arXiv},
address = {Lucca, Italy},
abstract = {Any data annotation for subjective tasks shows potential variations between individuals. This is particularly true for annotations of emotional responses to musical stimuli. While older approaches to music emotion recognition systems frequently addressed this uncertainty problem through probabilistic modeling, modern systems based on neural networks tend to ignore the variability and focus only on predicting central tendencies of human subjective responses. In this work, we explore several methods for estimating not only the central tendencies of the subjective responses to a musical stimulus, but also for estimating the uncertainty associated with these responses. In particular, we investigate probabilistic loss functions and inference-time random sampling. Experimental results indicate that while the modeling of the central tendencies is achievable, modeling of the uncertainty in subjective responses proves significantly more challenging with currently available approaches even when empirical estimates of variations in the responses are available.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Information Retrieval, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
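A minimal sketch of the two ingredients named in the abstract above: a probabilistic loss (here a Gaussian negative log-likelihood over a predicted mean and variance) and inference-time random sampling (here Monte Carlo dropout). The toy network, feature dimensions, and sampling count are illustrative assumptions, not the paper's architecture.

```python
# Sketch: probabilistic loss + inference-time sampling for emotion regression.
# Assumptions: the tiny MLP, feature size, and use of MC dropout are illustrative
# choices, not the specific models or sampling schemes studied in the paper.
import torch
import torch.nn as nn

class EmotionRegressor(nn.Module):
    def __init__(self, in_dim=128, out_dim=2):  # out_dim: valence, arousal
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(in_dim, 64), nn.ReLU(), nn.Dropout(0.2))
        self.mean_head = nn.Linear(64, out_dim)
        self.logvar_head = nn.Linear(64, out_dim)

    def forward(self, x):
        h = self.backbone(x)
        return self.mean_head(h), self.logvar_head(h)

model = EmotionRegressor()
nll = nn.GaussianNLLLoss()  # probabilistic loss over predicted mean and variance

x = torch.randn(8, 128)   # hypothetical audio features
y = torch.rand(8, 2)      # mean annotator ratings (central tendencies)
mean, logvar = model(x)
loss = nll(mean, y, logvar.exp())
loss.backward()

# Inference-time random sampling: keep dropout active and average many passes.
model.train()  # leaves dropout on (MC dropout)
with torch.no_grad():
    samples = torch.stack([model(x)[0] for _ in range(32)])
pred_mean, pred_std = samples.mean(0), samples.std(0)  # prediction + uncertainty
```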
2024
@inproceedings{kim_towards_2024,
title = {Towards Robust Transcription: Exploring Noise Injection Strategies for Training Data Augmentation},
author = {Yonghyun Kim and Alexander Lerch},
url = {http://arxiv.org/abs/2410.14122},
doi = {10.48550/arXiv.2410.14122},
year = {2024},
date = {2024-10-01},
urldate = {2024-10-25},
booktitle = {Late Breaking Demo (Extended Abstract), Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
publisher = {arXiv},
address = {San Francisco},
abstract = {Recent advancements in Automatic Piano Transcription (APT) have significantly improved system performance, but the impact of noisy environments on the system performance remains largely unexplored. This study investigates the impact of white noise at various Signal-to-Noise Ratio (SNR) levels on state-of-the-art APT models and evaluates the performance of the Onsets and Frames model when trained on noise-augmented data. We hope this research provides valuable insights as preliminary work toward developing transcription models that maintain consistent performance across a range of acoustic conditions.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Information Retrieval, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
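A minimal sketch of the noise-injection augmentation the abstract above investigates: white noise mixed into a training excerpt at a target SNR. The helper and the synthetic sine excerpt are illustrative, not the paper's implementation.

```python
# Sketch: add white noise to an audio signal at a chosen SNR (in dB).
# The sine "signal" is a hypothetical stand-in for a real training excerpt.
import numpy as np

def add_white_noise(signal: np.ndarray, snr_db: float, rng=None) -> np.ndarray:
    rng = rng or np.random.default_rng()
    noise = rng.standard_normal(signal.shape)
    signal_power = np.mean(signal ** 2)
    noise_power = np.mean(noise ** 2)
    # Scale the noise so that 10 * log10(signal_power / scaled_noise_power) == snr_db.
    scale = np.sqrt(signal_power / (noise_power * 10 ** (snr_db / 10)))
    return signal + scale * noise

sr = 16_000
t = np.arange(sr) / sr
clean = 0.5 * np.sin(2 * np.pi * 440 * t)     # hypothetical 1 s excerpt
noisy = add_white_noise(clean, snr_db=10.0)   # augmented training input
```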
@article{han_understanding_2024,
title = {Understanding Pedestrian Movement Using Urban Sensing Technologies: The Promise of Audio-based Sensors},
author = {Chaeyeon Han and Pavan Seshadri and Yiwei Ding and Noah Posner and Bon Woo Koo and Animesh Agrawal and Alexander Lerch and Subhrajit Guhathakurta},
url = {https://doi.org/10.1007/s44212-024-00053-9},
doi = {10.1007/s44212-024-00053-9},
issn = {2731-6963},
year = {2024},
date = {2024-07-01},
urldate = {2024-07-10},
journal = {Urban Informatics},
volume = {3},
number = {1},
pages = {22},
abstract = {While various sensors have been deployed to monitor vehicular flows, sensing pedestrian movement is still nascent. Yet walking is a significant mode of travel in many cities, especially those in Europe, Africa, and Asia. Understanding pedestrian volumes and flows is essential for designing safer and more attractive pedestrian infrastructure and for controlling periodic overcrowding. This study discusses a new approach to scale up urban sensing of people with the help of novel audio-based technology. It assesses the benefits and limitations of microphone-based sensors as compared to other forms of pedestrian sensing. A large-scale dataset called ASPED is presented, which includes high-quality audio recordings along with video recordings used for labeling the pedestrian count data. The baseline analyses highlight the promise of using audio sensors for pedestrian tracking, although algorithmic and technological improvements to make the sensors practically usable continue. This study also demonstrates how the data can be leveraged to predict pedestrian trajectories. Finally, it discusses the use cases and scenarios where audio-based pedestrian sensing can support better urban and transportation planning.},
keywords = {Active mobility, Audio-based, Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Multimedia, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, Pedestrian, Sensors},
pubstate = {published},
tppubtype = {article}
}
@inproceedings{watcharasupat_stem-agnostic_2024,
title = {A Stem-Agnostic Single-Decoder System for Music Source Separation Beyond Four Stems},
author = {Karn N. Watcharasupat and Alexander Lerch},
url = {http://arxiv.org/abs/2406.18747},
doi = {10.48550/arXiv.2406.18747},
year = {2024},
date = {2024-06-01},
urldate = {2024-08-08},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
address = {San Francisco},
abstract = {Despite significant recent progress across multiple subtasks of audio source separation, few music source separation systems support separation beyond the four-stem vocals, drums, bass, and other (VDBO) setup. Of the very few current systems that support source separation beyond this setup, most continue to rely on an inflexible decoder setup that can only support a fixed pre-defined set of stems. Increasing stem support in these inflexible systems correspondingly requires increasing computational complexity, rendering extensions of these systems computationally infeasible for long-tail instruments. In this work, we propose Banquet, a system that allows source separation of multiple stems using just one decoder. A bandsplit source separation model is extended to work in a query-based setup in tandem with a music instrument recognition PaSST model. On the MoisesDB dataset, Banquet, at only 24.9 M trainable parameters, approached the performance level of the significantly more complex 6-stem Hybrid Transformer Demucs on VDBO stems and outperformed it on guitar and piano. The query-based setup allows for the separation of narrow instrument classes such as clean acoustic guitars, and can be successfully applied to the extraction of less common stems such as reeds and organs. Implementation is available at https://github.com/kwatcharasupat/query-bandit.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Information Retrieval, Computer Science - Machine Learning, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
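A minimal conceptual sketch of the query-based, single-decoder idea described in the Banquet abstract above: a per-stem query embedding (standing in for the output of the PaSST instrument-recognition model) conditions one shared decoder, so supporting a new stem only requires a new query rather than a new decoder. The FiLM-style conditioning and all dimensions are illustrative assumptions, not the actual bandsplit architecture.

```python
# Sketch: one shared decoder conditioned on a per-stem query embedding.
# Assumptions: FiLM-style conditioning and all dimensions are illustrative;
# in the paper the query comes from a PaSST instrument-recognition model.
import torch
import torch.nn as nn

class QueryConditionedSeparator(nn.Module):
    def __init__(self, feat_dim=256, query_dim=128):
        super().__init__()
        self.encoder = nn.Linear(feat_dim, feat_dim)    # shared mixture encoder
        self.film = nn.Linear(query_dim, 2 * feat_dim)  # query -> scale, shift
        self.decoder = nn.Linear(feat_dim, feat_dim)    # single shared decoder

    def forward(self, mixture_feats, query):
        h = torch.relu(self.encoder(mixture_feats))
        scale, shift = self.film(query).chunk(2, dim=-1)
        h = scale * h + shift                   # condition features on the query
        return self.decoder(h)                  # features of the queried stem

model = QueryConditionedSeparator()
mixture = torch.randn(4, 100, 256)       # (batch, frames, features) of the mixture
guitar_query = torch.randn(4, 128)       # embedding for the "guitar" stem
piano_query = torch.randn(4, 128)        # a different query reuses the same decoder
guitar_out = model(mixture, guitar_query.unsqueeze(1))
piano_out = model(mixture, piano_query.unsqueeze(1))
```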