dynamic-superb-utils's People
dynamic-superb-utils's Issues
Download dataset and build subset
from datasets import load_dataset, Dataset, disable_caching, load_from_disk
from pathlib import Path
import logging
# Turn off the `datasets` library's caching at import time, before any dataset
# is loaded or transformed below.
# NOTE(review): presumably done so results are only kept in the directories
# this script saves explicitly — confirm against the `datasets` docs.
disable_caching()
# Training datasets. Not processed by default; point ``all_datasets`` in
# ``main`` at this list to handle them instead of the testing datasets.
_TRAIN_DATASETS = [
    "BigSuperbPrivate/SpoofDetection_Asvspoof2017",
    "BigSuperbPrivate/DailyTalk_DialogueActClassification",
    "BigSuperbPrivate/PronounciationEvaluationProsodic_Speechocean762",
    "BigSuperbPrivate/PronounciationEvaluationFluency_Speechocean762",
    "BigSuperbPrivate/PronounciationEvaluationOverall_Speechocean762",
    "BigSuperbPrivate/PronounciationEvaluationAccuracy_Speechocean762",
    "BigSuperbPrivate/HowFarAreYou_DeeplyParentChildVocalInteraction",
    "BigSuperbPrivate/HowFarAreYou_KoreanReadSpeechCorpus",
    "BigSuperbPrivate/SpeakerVerification_Tedlium2Train",
    "BigSuperbPrivate/SpeechDetection_Aishell1Train",
    "BigSuperbPrivate/SpeakerVerification_LibrispeechTrainClean100",
    "BigSuperbPrivate/SpeakerVerification_Aishell1Train",
    "BigSuperbPrivate/SpeechDetection_Voxceleb1Train",
    "BigSuperbPrivate/SpeakerVerification_Voxceleb1Train",
    "BigSuperbPrivate/SpokenTermDetection_Tedlium2Train",
    "BigSuperbPrivate/NoiseSNRLevelPredictionSpeech_VoxcelebMusan",
    "BigSuperbPrivate/SpeechDetection_LibrispeechTrainClean100",
    "BigSuperbPrivate/NoiseSNRLevelPredictionNoise_VoxcelebMusan",
    "BigSuperbPrivate/SpeechDetection_Tedlium2Train",
    "BigSuperbPrivate/EnhancementDetection_LibrittsTrainClean360Wham",
    "BigSuperbPrivate/SpeakerCounting_LibrittsTrainClean100",
    "BigSuperbPrivate/NoiseSNRLevelPredictionGaussian_VoxcelebMusan",
    "BigSuperbPrivate/NoiseSNRLevelPredictionMusic_VoxcelebMusan",
    "BigSuperbPrivate/SpeechTextMatching_Tedlium2Train",
    "BigSuperbPrivate/ReverberationDetectionSmallRoom_VoxcelebRirsNoises",
    "BigSuperbPrivate/ReverberationDetectionMediumRoom_VoxcelebRirsNoises",
    "BigSuperbPrivate/SpokenTermDetection_LibrispeechTrainClean100",
    "BigSuperbPrivate/ReverberationDetectionLargeRoom_VoxcelebRirsNoises",
    "BigSuperbPrivate/NoiseDetectionSpeech_VoxcelebMusan",
    "BigSuperbPrivate/NoiseDetectionNoise_VoxcelebMusan",
    "BigSuperbPrivate/NoiseDetectionMusic_VoxcelebMusan",
    "BigSuperbPrivate/NoiseDetectionGaussian_VoxcelebMusan",
    "BigSuperbPrivate/SpoofDetection_ASVspoof2015",
    "BigSuperbPrivate/SpeechTextMatching_LibrispeechTrainClean100",
]

# Testing datasets (the list processed by ``main``).
_TEST_DATASETS = [
    "SpeechBigBench/AccentClassification_AccentdbExtended",
    "SpeechBigBench/BirdSoundDetection_Warblrb10k",
    "SpeechBigBench/ChordClassification_AcousticGuitarAndPiano",
    "SpeechBigBench/Deeply_Parent_Child_Vocal_Interaction",
    "SpeechBigBench/DialogueActClassification_DailyTalk",
    "SpeechBigBench/DialogueEmotionClassification_DailyTalk",
    "SpeechBigBench/EmotionRecognition_MultimodalEmotionlinesDataset",
    "SpeechBigBench/EnhancementDetection_LibrittsTestCleanWham",
    "SpeechBigBench/EnvironmentalSoundClassification_AnimalsESC50",
    "SpeechBigBench/EnvironmentalSoundClassification_ExteriorAndUrbanNoisesESC50",
    "SpeechBigBench/EnvironmentalSoundClassification_HumanAndNonSpeechSoundsESC50",
    "SpeechBigBench/EnvironmentalSoundClassification_InteriorAndDomesticSoundsESC50",
    "SpeechBigBench/EnvironmentalSoundClassification_NaturalSoundscapesAndWaterSoundsESC50",
    "SpeechBigBench/HowFarAreYou_3DSpeaker",
    "SpeechBigBench/IntentClassification_FluentSpeechCommands",
    "SpeechBigBench/Korean_Read_Speech_Corpus",
    "SpeechBigBench/LanguageIdentification_VoxForge",
    "SpeechBigBench/NoiseDetectiongaussian_LJSpeechMusan",
    "SpeechBigBench/NoiseDetectiongaussian_VCTKMusan",
    "SpeechBigBench/NoiseDetectionmusic_LJSpeechMusan",
    "SpeechBigBench/NoiseDetectionmusic_VCTKMusan",
    "SpeechBigBench/NoiseDetectionnoise_LJSpeechMusan",
    "SpeechBigBench/NoiseDetectionnoise_VCTKMusan",
    "SpeechBigBench/NoiseDetectionspeech_LJSpeechMusan",
    "SpeechBigBench/NoiseDetectionspeech_VCTKMusan",
    "SpeechBigBench/NoiseSNRLevelPredictiongaussian_VCTKMusan",
    "SpeechBigBench/NoiseSNRLevelPredictionmusic_VCTKMusan",
    "SpeechBigBench/NoiseSNRLevelPredictionnoise_VCTKMusan",
    "SpeechBigBench/NoiseSNRLevelPredictionspeech_VCTKMusan",
    "SpeechBigBench/Nonverbal_Vocalization",
    "SpeechBigBench/PronounciationEvaluationAccuracy_Speechocean762",
    "SpeechBigBench/PronounciationEvaluationFluency_Speechocean762",
    "SpeechBigBench/PronounciationEvaluationOverall_Speechocean762",
    "SpeechBigBench/PronounciationEvaluationProsodic_Speechocean762",
    "SpeechBigBench/ReverberationDetectionlargeroom_LJSpeechRirsNoises",
    "SpeechBigBench/ReverberationDetectionlargeroom_VCTKRirsNoises",
    "SpeechBigBench/ReverberationDetectionmediumroom_LJSpeechRirsNoises",
    "SpeechBigBench/ReverberationDetectionmediumroom_VCTKRirsNoises",
    "SpeechBigBench/ReverberationDetectionsmallroom_LJSpeechRirsNoises",
    "SpeechBigBench/ReverberationDetectionsmallroom_VCTKRirsNoises",
    "SpeechBigBench/SarcasmDetection_Mustard",
    "SpeechBigBench/SpeakerCounting_LibriTTSTestClean",
    "SpeechBigBench/SpeechCommandRecognition_GoogleSpeechCommandsV1",
    "SpeechBigBench/SpeechDetection_LJSpeech",
    "SpeechBigBench/SpeechDetection_LibriSpeechTestClean",
    "SpeechBigBench/SpeechDetection_LibriSpeechTestOther",
    "SpeechBigBench/SpeechTextMatching_LJSpeech",
    "SpeechBigBench/SpeechTextMatching_LibriSpeechTestClean",
    "SpeechBigBench/SpeechTextMatching_LibriSpeechTestOther",
    "SpeechBigBench/SpokenTermDetection_LJSpeech",
    "SpeechBigBench/SpokenTermDetection_LibriSpeechTestClean",
    "SpeechBigBench/SpokenTermDetection_LibriSpeechTestOther",
    "SpeechBigBench/SpoofDetection_ASVspoof2015",
    "SpeechBigBench/SpoofDetection_ASVspoof2017",
    "SpeechBigBench/StressDetection_MIRSD",
    "SpeechBigBench/arabic_speech_corpus",
    "SpeechBigBench/speech_commands",
]


def _load_or_download(dataset_name, dataset_dir):
    """Return the dataset for *dataset_name*, reusing the copy on disk if any.

    If *dataset_dir* already exists it is loaded with ``load_from_disk``;
    otherwise the dataset is fetched with ``load_dataset`` and saved to
    *dataset_dir* so that later runs take the first path.
    """
    if dataset_dir.exists():
        logging.info(f"==> SKIP {dataset_name}")
        return load_from_disk(dataset_dir)

    logging.info(f"==> Not yet downloaded {dataset_name}")
    dataset = load_dataset(dataset_name)
    logging.info(f"==> Save original {dataset_name}")
    dataset_dir.mkdir(parents=True, exist_ok=True)
    dataset.save_to_disk(dataset_dir)
    return dataset


def _save_test_subset(dataset, dataset_name, data_path, subset_size):
    """Save a shuffled, truncated copy of the ``"test"`` split of *dataset*.

    The split is shuffled with a fixed seed (42), its first *subset_size* rows
    are rebuilt into a new ``Dataset`` and written to
    ``data_path / f"{dataset_name}_test{subset_size}"``. When ``dataset.get``
    yields no (or a falsy) ``"test"`` entry, a warning is logged and nothing
    is written.

    Assumes *dataset* is a mapping of split names to datasets (presumably a
    ``DatasetDict``) — TODO confirm for datasets loaded from disk.
    """
    test_split = dataset.get("test")
    if not test_split:
        logging.warning(f"No test in {dataset_name}")
        return

    shuffled = test_split.shuffle(seed=42)
    subset = Dataset.from_dict(shuffled[:subset_size])
    logging.info(len(subset))

    # Derive the label from subset_size so the directory name can never
    # disagree with the number of rows actually requested.
    subset_tag = f"test{subset_size}"
    logging.info(f"==> Save {subset_tag} {dataset_name}")
    subset_dir = data_path / f"{dataset_name}_{subset_tag}"
    subset_dir.mkdir(parents=True, exist_ok=True)
    subset.save_to_disk(subset_dir)


def main():
    """Download every listed dataset and store a fixed-size test subset.

    For each dataset name the full dataset is loaded from ``data_path`` (or
    downloaded and saved there first), then a seeded, shuffled subset of its
    test split is saved next to it. Progress is logged to
    ``test_download.log``.
    """
    logging.basicConfig(
        filename="test_download.log",
        level=logging.INFO,
        format="[%(asctime)s] [%(levelname)s] %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    data_path = Path("/work/u8915687/big-superb")
    # Use _TRAIN_DATASETS here to process the training datasets instead.
    all_datasets = _TEST_DATASETS
    subset_size = 1000

    for dataset_name in all_datasets:
        logging.info(f"==> Start {dataset_name}")

        # Download dataset to disk
        dataset = _load_or_download(dataset_name, data_path / dataset_name)

        # Build subset
        logging.info(f"==> Subset {dataset_name}")
        _save_test_subset(dataset, dataset_name, data_path, subset_size)

        logging.info("=" * 100)
# Run the download/subset pipeline only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Something interesting about the web: a new door to the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Something interesting about visualization: using data as art.
-
Game
Something interesting about games that makes everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China's Tencent open source team.