@InProceedings{GhinassiEtAl23ICME,
  author    = "Ghinassi, Iacopo and Purver, Matthew and Phan, Huy and Newell, Chris",
  title     = "Exploring Pre-Trained Neural Audio Representations for Audio Topic Segmentation",
  booktitle = "IEEE International Conference on Multimedia and Expo (ICME)",
  year      = 2023,
  month     = jul,
  pages     = "1086-1091",
  doi       = "10.1109/ICME55011.2023.00190",
  url       = "https://doi.org/10.1109/ICME55011.2023.00190",
  publisher = "IEEE Computer Society",
  address   = "Los Alamitos, CA",
  abstract  = {Recent work has shown that audio embeddings can improve automatic topic segmentation of formats such as radio shows. In this work we expand on that direction by showing how, and which, publicly available pre-trained neural audio embeddings can perform the task without the need for any further fine-tuning of the audio encoders. The ranking of the encoders suggests that neural encoders pre-trained for speaker diarization and general-purpose audio classification are the best suited for use as features, beating non-neural baselines. We show that we can obtain perfect results on a newly created random dataset similar to the one used in previous work. We also show, for the first time, results on real-world data, proving that our method can be applied to actual radio shows with good results, although the choice of audio encoder is extremely important in order to achieve them. Finally, by releasing the datasets we used, we provide the first (to our knowledge) publicly available, free-of-charge datasets for audio topic segmentation of media products.},
  keywords  = {media; encoding; task analysis; digital audio broadcasting; synthetic data},
}