2025
@inproceedings{kim_pianovam_2025,
title = {PianoVAM: A Multimodal Piano Performance Dataset},
author = {Yonghyun Kim and Junhyung Park and Joonhyung Bae and Kirak Kim and Taegyun Kwon and Alexander Lerch and Juhan Nam},
url = {http://arxiv.org/abs/2509.08800},
doi = {10.48550/arXiv.2509.08800},
year = {2025},
date = {2025-09-01},
urldate = {2025-09-19},
booktitle = {Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
address = {Daejeon, South Korea},
abstract = {The multimodal nature of music performance has driven increasing interest in data beyond the audio domain within the music information retrieval (MIR) community. This paper introduces PianoVAM, a comprehensive piano performance dataset that includes videos, audio, MIDI, hand landmarks, fingering labels, and rich metadata. The dataset was recorded using a Disklavier piano, capturing audio and MIDI from amateur pianists during their daily practice sessions, alongside synchronized top-view videos in realistic and varied performance conditions. Hand landmarks and fingering labels were extracted using a pretrained hand pose estimation model and a semi-automated fingering annotation algorithm. We discuss the challenges encountered during data collection and the alignment process across different modalities. Additionally, we describe our fingering annotation method based on hand landmarks extracted from videos. Finally, we present benchmarking results for both audio-only and audio-visual piano transcription using the PianoVAM dataset and discuss additional potential applications.},
keywords = {Computer Science - Artificial Intelligence, Computer Science - Computer Vision and Pattern Recognition, Computer Science - Multimedia, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
@inproceedings{park_two_2025,
title = {Two Web Toolkits for Multimodal Piano Performance Dataset Acquisition and Fingering Annotation},
author = {Junhyung Park and Yonghyun Kim and Joonhyung Bae and Kirak Kim and Taegyun Kwon and Alexander Lerch and Juhan Nam},
url = {http://arxiv.org/abs/2509.15222},
doi = {10.48550/arXiv.2509.15222},
year = {2025},
date = {2025-09-01},
urldate = {2025-09-20},
booktitle = {Late Breaking Demo (Extended Abstract), Proceedings of the International Society for Music Information Retrieval Conference (ISMIR)},
address = {Daejeon, South Korea},
abstract = {Piano performance is a multimodal activity that intrinsically combines physical actions with the acoustic rendition. Despite growing research interest in analyzing the multimodal nature of piano performance, the laborious process of acquiring large-scale multimodal data remains a significant bottleneck, hindering further progress in this field. To overcome this barrier, we present an integrated web toolkit comprising two graphical user interfaces (GUIs): (i) PiaRec, which supports the synchronized acquisition of audio, video, MIDI, and performance metadata. (ii) ASDF, which enables the efficient annotation of performer fingering from the visual data. Collectively, this system can streamline the acquisition of multimodal piano performance datasets.},
keywords = {Computer Science - Computer Vision and Pattern Recognition, Computer Science - Multimedia, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, Electrical Engineering and Systems Science - Image and Video Processing},
pubstate = {published},
tppubtype = {inproceedings}
}
2024
@article{han_understanding_2024,
title = {Understanding Pedestrian Movement Using Urban Sensing Technologies: The Promise of Audio-based Sensors},
author = {Chaeyeon Han and Pavan Seshadri and Yiwei Ding and Noah Posner and Bon Woo Koo and Animesh Agrawal and Alexander Lerch and Subhrajit Guhathakurta},
url = {https://doi.org/10.1007/s44212-024-00053-9},
doi = {10.1007/s44212-024-00053-9},
issn = {2731-6963},
year = {2024},
date = {2024-07-01},
urldate = {2024-07-10},
journal = {Urban Informatics},
volume = {3},
number = {1},
pages = {22},
abstract = {While various sensors have been deployed to monitor vehicular flows, sensing pedestrian movement is still nascent. Yet walking is a significant mode of travel in many cities, especially those in Europe, Africa, and Asia. Understanding pedestrian volumes and flows is essential for designing safer and more attractive pedestrian infrastructure and for controlling periodic overcrowding. This study discusses a new approach to scale up urban sensing of people with the help of novel audio-based technology. It assesses the benefits and limitations of microphone-based sensors as compared to other forms of pedestrian sensing. A large-scale dataset called ASPED is presented, which includes high-quality audio recordings along with video recordings used for labeling the pedestrian count data. The baseline analyses highlight the promise of using audio sensors for pedestrian tracking, although algorithmic and technological improvements to make the sensors practically usable continue. This study also demonstrates how the data can be leveraged to predict pedestrian trajectories. Finally, it discusses the use cases and scenarios where audio-based pedestrian sensing can support better urban and transportation planning.},
keywords = {Active mobility, Audio-based, Computer Science - Artificial Intelligence, Computer Science - Machine Learning, Computer Science - Multimedia, Computer Science - Sound, Electrical Engineering and Systems Science - Audio and Speech Processing, Pedestrian, Sensors},
pubstate = {published},
tppubtype = {article}
}