Coder Social home page Coder Social logo

dynamic-superb-utils's People

Contributors

cyhuang-tw avatar

Watchers

 avatar

dynamic-superb-utils's Issues

Download dataset and build subset

from datasets import load_dataset, Dataset, disable_caching, load_from_disk
from pathlib import Path
import logging

# Module-level side effect: turn off the `datasets` library's caching as soon
# as this file is imported or run, before any dataset is loaded below.
disable_caching()

def main():
    """Download each benchmark dataset and save a shuffled test subset.

    For every repository name in the hard-coded list below:

    1. If ``data_path/<name>`` already exists, the dataset is loaded from
       disk; otherwise it is fetched with ``load_dataset`` and saved there.
    2. If the dataset has a non-empty ``"test"`` split, that split is
       shuffled with a fixed seed (42), truncated to at most ``subset_size``
       rows, and saved to ``data_path/<name>_test<subset_size>``.
       Otherwise a warning is logged and the dataset is skipped.

    Progress is written to ``test_download.log`` in the working directory.
    """
    logging.basicConfig(
        filename="test_download.log",
        level=logging.INFO,
        format="[%(asctime)s] [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )

    data_path = Path("/work/u8915687/big-superb")

    # Training datasets. Kept for reference; swap this list in for
    # ``all_datasets`` to process the training repositories instead.
    # (Every line is commented out: a previously uncommented, line-wrapped
    # tail of this list made the script a SyntaxError.)
    # all_datasets = [
    #     'BigSuperbPrivate/SpoofDetection_Asvspoof2017',
    #     'BigSuperbPrivate/DailyTalk_DialogueActClassification',
    #     'BigSuperbPrivate/PronounciationEvaluationProsodic_Speechocean762',
    #     'BigSuperbPrivate/PronounciationEvaluationFluency_Speechocean762',
    #     'BigSuperbPrivate/PronounciationEvaluationOverall_Speechocean762',
    #     'BigSuperbPrivate/PronounciationEvaluationAccuracy_Speechocean762',
    #     'BigSuperbPrivate/HowFarAreYou_DeeplyParentChildVocalInteraction',
    #     'BigSuperbPrivate/HowFarAreYou_KoreanReadSpeechCorpus',
    #     'BigSuperbPrivate/SpeakerVerification_Tedlium2Train',
    #     'BigSuperbPrivate/SpeechDetection_Aishell1Train',
    #     'BigSuperbPrivate/SpeakerVerification_LibrispeechTrainClean100',
    #     'BigSuperbPrivate/SpeakerVerification_Aishell1Train',
    #     'BigSuperbPrivate/SpeechDetection_Voxceleb1Train',
    #     'BigSuperbPrivate/SpeakerVerification_Voxceleb1Train',
    #     'BigSuperbPrivate/SpokenTermDetection_Tedlium2Train',
    #     'BigSuperbPrivate/NoiseSNRLevelPredictionSpeech_VoxcelebMusan',
    #     'BigSuperbPrivate/SpeechDetection_LibrispeechTrainClean100',
    #     'BigSuperbPrivate/NoiseSNRLevelPredictionNoise_VoxcelebMusan',
    #     'BigSuperbPrivate/SpeechDetection_Tedlium2Train',
    #     'BigSuperbPrivate/EnhancementDetection_LibrittsTrainClean360Wham',
    #     'BigSuperbPrivate/SpeakerCounting_LibrittsTrainClean100',
    #     'BigSuperbPrivate/NoiseSNRLevelPredictionGaussian_VoxcelebMusan',
    #     'BigSuperbPrivate/NoiseSNRLevelPredictionMusic_VoxcelebMusan',
    #     'BigSuperbPrivate/SpeechTextMatching_Tedlium2Train',
    #     'BigSuperbPrivate/ReverberationDetectionSmallRoom_VoxcelebRirsNoises',
    #     'BigSuperbPrivate/ReverberationDetectionMediumRoom_VoxcelebRirsNoises',
    #     'BigSuperbPrivate/SpokenTermDetection_LibrispeechTrainClean100',
    #     'BigSuperbPrivate/ReverberationDetectionLargeRoom_VoxcelebRirsNoises',
    #     'BigSuperbPrivate/NoiseDetectionSpeech_VoxcelebMusan',
    #     'BigSuperbPrivate/NoiseDetectionNoise_VoxcelebMusan',
    #     'BigSuperbPrivate/NoiseDetectionMusic_VoxcelebMusan',
    #     'BigSuperbPrivate/NoiseDetectionGaussian_VoxcelebMusan',
    #     'BigSuperbPrivate/SpoofDetection_ASVspoof2015',
    #     'BigSuperbPrivate/SpeechTextMatching_LibrispeechTrainClean100',
    # ]

    # Testing datasets
    all_datasets = [
        'SpeechBigBench/AccentClassification_AccentdbExtended',
        'SpeechBigBench/BirdSoundDetection_Warblrb10k',
        'SpeechBigBench/ChordClassification_AcousticGuitarAndPiano',
        'SpeechBigBench/Deeply_Parent_Child_Vocal_Interaction',
        'SpeechBigBench/DialogueActClassification_DailyTalk',
        'SpeechBigBench/DialogueEmotionClassification_DailyTalk',
        'SpeechBigBench/EmotionRecognition_MultimodalEmotionlinesDataset',
        'SpeechBigBench/EnhancementDetection_LibrittsTestCleanWham',
        'SpeechBigBench/EnvironmentalSoundClassification_AnimalsESC50',
        'SpeechBigBench/EnvironmentalSoundClassification_ExteriorAndUrbanNoisesESC50',
        'SpeechBigBench/EnvironmentalSoundClassification_HumanAndNonSpeechSoundsESC50',
        'SpeechBigBench/EnvironmentalSoundClassification_InteriorAndDomesticSoundsESC50',
        'SpeechBigBench/EnvironmentalSoundClassification_NaturalSoundscapesAndWaterSoundsESC50',
        'SpeechBigBench/HowFarAreYou_3DSpeaker',
        'SpeechBigBench/IntentClassification_FluentSpeechCommands',
        'SpeechBigBench/Korean_Read_Speech_Corpus',
        'SpeechBigBench/LanguageIdentification_VoxForge',
        'SpeechBigBench/NoiseDetectiongaussian_LJSpeechMusan',
        'SpeechBigBench/NoiseDetectiongaussian_VCTKMusan',
        'SpeechBigBench/NoiseDetectionmusic_LJSpeechMusan',
        'SpeechBigBench/NoiseDetectionmusic_VCTKMusan',
        'SpeechBigBench/NoiseDetectionnoise_LJSpeechMusan',
        'SpeechBigBench/NoiseDetectionnoise_VCTKMusan',
        'SpeechBigBench/NoiseDetectionspeech_LJSpeechMusan',
        'SpeechBigBench/NoiseDetectionspeech_VCTKMusan',
        'SpeechBigBench/NoiseSNRLevelPredictiongaussian_VCTKMusan',
        'SpeechBigBench/NoiseSNRLevelPredictionmusic_VCTKMusan',
        'SpeechBigBench/NoiseSNRLevelPredictionnoise_VCTKMusan',
        'SpeechBigBench/NoiseSNRLevelPredictionspeech_VCTKMusan',
        'SpeechBigBench/Nonverbal_Vocalization',
        'SpeechBigBench/PronounciationEvaluationAccuracy_Speechocean762',
        'SpeechBigBench/PronounciationEvaluationFluency_Speechocean762',
        'SpeechBigBench/PronounciationEvaluationOverall_Speechocean762',
        'SpeechBigBench/PronounciationEvaluationProsodic_Speechocean762',
        'SpeechBigBench/ReverberationDetectionlargeroom_LJSpeechRirsNoises',
        'SpeechBigBench/ReverberationDetectionlargeroom_VCTKRirsNoises',
        'SpeechBigBench/ReverberationDetectionmediumroom_LJSpeechRirsNoises',
        'SpeechBigBench/ReverberationDetectionmediumroom_VCTKRirsNoises',
        'SpeechBigBench/ReverberationDetectionsmallroom_LJSpeechRirsNoises',
        'SpeechBigBench/ReverberationDetectionsmallroom_VCTKRirsNoises',
        'SpeechBigBench/SarcasmDetection_Mustard',
        'SpeechBigBench/SpeakerCounting_LibriTTSTestClean',
        'SpeechBigBench/SpeechCommandRecognition_GoogleSpeechCommandsV1',
        'SpeechBigBench/SpeechDetection_LJSpeech',
        'SpeechBigBench/SpeechDetection_LibriSpeechTestClean',
        'SpeechBigBench/SpeechDetection_LibriSpeechTestOther',
        'SpeechBigBench/SpeechTextMatching_LJSpeech',
        'SpeechBigBench/SpeechTextMatching_LibriSpeechTestClean',
        'SpeechBigBench/SpeechTextMatching_LibriSpeechTestOther',
        'SpeechBigBench/SpokenTermDetection_LJSpeech',
        'SpeechBigBench/SpokenTermDetection_LibriSpeechTestClean',
        'SpeechBigBench/SpokenTermDetection_LibriSpeechTestOther',
        'SpeechBigBench/SpoofDetection_ASVspoof2015',
        'SpeechBigBench/SpoofDetection_ASVspoof2017',
        'SpeechBigBench/StressDetection_MIRSD',
        'SpeechBigBench/arabic_speech_corpus',
        'SpeechBigBench/speech_commands',
    ]

    subset_size = 1000
    # Derived from subset_size so the directory name and log message can
    # never disagree with the number of rows actually kept.
    subset_tag = f"test{subset_size}"

    for dataset_name in all_datasets:
        logging.info("==> Start %s", dataset_name)
        dataset_dir = data_path / dataset_name

        # Download dataset to disk (or reuse the copy saved by an earlier run).
        if dataset_dir.exists():
            logging.info("==> SKIP %s", dataset_name)
            dataset = load_from_disk(dataset_dir)
        else:
            logging.info("==> Not yet downloaded %s", dataset_name)
            dataset = load_dataset(dataset_name)

            logging.info("==> Save original %s", dataset_name)
            dataset_dir.mkdir(parents=True, exist_ok=True)
            dataset.save_to_disk(dataset_dir)

        # Build subset
        logging.info("==> Subset %s", dataset_name)
        test_split = dataset.get("test")
        # A missing split and an empty split are both treated as "no test
        # data"; the check is explicit rather than relying on truthiness.
        if test_split is not None and len(test_split) > 0:
            shuffled = test_split.shuffle(seed=42)
            # select() keeps the split's declared features and does not
            # materialize every row as Python objects, unlike rebuilding the
            # dataset with Dataset.from_dict(shuffled[:subset_size]).
            subset = shuffled.select(range(min(subset_size, len(shuffled))))
            logging.info("%d", len(subset))

            logging.info("==> Save %s %s", subset_tag, dataset_name)
            subset_dir = data_path / f"{dataset_name}_{subset_tag}"
            subset_dir.mkdir(parents=True, exist_ok=True)
            subset.save_to_disk(subset_dir)
        else:
            logging.warning("No test in %s", dataset_name)

        logging.info("=" * 100)

# Run the download-and-subset job only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Something interesting about the web: a new door to the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Something interesting about games that makes everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.