diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Dataset/get_dataset.py b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Dataset/get_dataset.py new file mode 100644 index 0000000000..f30a8d06e7 --- /dev/null +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Dataset/get_dataset.py @@ -0,0 +1,35 @@ +import os +import shutil +import argparse +from datasets import load_dataset +from tqdm import tqdm + +language_to_code = { + "japanese": "ja", + "swedish": "sv-SE" +} + +def download_dataset(output_dir): + for lang, lang_code in language_to_code.items(): + print(f"Processing dataset for language: {lang_code}") + + # Load the dataset for the specific language + dataset = load_dataset("mozilla-foundation/common_voice_11_0", lang_code, split="train", trust_remote_code=True) + + # Create a language-specific output folder + output_folder = os.path.join(output_dir, lang, lang_code, "clips") + os.makedirs(output_folder, exist_ok=True) + + # Extract and copy MP3 files + for sample in tqdm(dataset, desc=f"Extracting and copying MP3 files for {lang}"): + audio_path = sample['audio']['path'] + shutil.copy(audio_path, output_folder) + + print("Extraction and copy complete.") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Extract and copy audio files from a dataset to a specified directory.") + parser.add_argument("--output_dir", type=str, default="/data/commonVoice", help="Base output directory for saving the files. Default is /data/commonVoice") + args = parser.parse_args() + + download_dataset(args.output_dir) \ No newline at end of file diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/clean.sh b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/clean.sh index 7ea1719af4..34747af45c 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/clean.sh +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/clean.sh @@ -1,7 +1,5 @@ #!/bin/bash -rm -R RIRS_NOISES -rm -R tmp -rm -R speechbrain -rm -f rirs_noises.zip noise.csv reverb.csv vad_file.txt +echo "Deleting .wav files, tmp" rm -f ./*.wav +rm -R tmp diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_commonVoice.py b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_commonVoice.py index 6442418bf0..7effb2df76 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_commonVoice.py +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_commonVoice.py @@ -29,7 +29,7 @@ def __init__(self, dirpath, filename): self.sampleRate = 0 self.waveData = '' self.wavesize = 0 - self.waveduriation = 0 + self.waveduration = 0 if filename.endswith(".wav") or filename.endswith(".wmv"): self.wavefile = filename self.wavepath = dirpath + os.sep + filename @@ -173,12 +173,12 @@ def main(argv): data = datafile(testDataDirectory, filename) predict_list = [] use_entire_audio_file = False - if data.waveduration < sample_dur: + if int(data.waveduration) <= sample_dur: # Use entire audio file if the duration is less than the sampling duration use_entire_audio_file = True sample_list = [0 for _ in range(sample_size)] else: - start_time_list = list(range(sample_size - int(data.waveduration) + 1)) + start_time_list = list(range(0, int(data.waveduration) - sample_dur)) sample_list = [] for i in range(sample_size): 
sample_list.append(random.sample(start_time_list, 1)[0]) @@ -198,10 +198,6 @@ def main(argv): predict_list.append(' ') pass - # Clean up - if use_entire_audio_file: - os.remove("./" + data.filename) - # Pick the top rated prediction result occurence_count = Counter(predict_list) total_count = sum(occurence_count.values()) diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py index b4f9d6adee..2b4a331c0b 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/inference_custom.py @@ -30,7 +30,7 @@ def __init__(self, dirpath, filename): self.sampleRate = 0 self.waveData = '' self.wavesize = 0 - self.waveduriation = 0 + self.waveduration = 0 if filename.endswith(".wav") or filename.endswith(".wmv"): self.wavefile = filename self.wavepath = dirpath + os.sep + filename @@ -61,41 +61,45 @@ def __init__(self, ipex_op=False, bf16=False, int8_model=False): self.model_int8 = load(source_model_int8_path, self.language_id) self.model_int8.eval() elif ipex_op: + self.language_id.eval() + # Optimize for inference with IPEX print("Optimizing inference with IPEX") - self.language_id.eval() - sampleInput = (torch.load("./sample_input_features.pt"), torch.load("./sample_input_wav_lens.pt")) if bf16: print("BF16 enabled") self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"], dtype=torch.bfloat16) self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"], dtype=torch.bfloat16) - self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"], dtype=torch.bfloat16) self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"], dtype=torch.bfloat16) else: self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"]) self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"]) - self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"]) self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"]) # Torchscript to resolve performance issues with reorder operations + print("Applying Torchscript") + sampleWavs = torch.load("./sample_wavs.pt") + sampleWavLens = torch.ones(sampleWavs.shape[0]) with torch.no_grad(): - I2 = self.language_id.mods["embedding_model"](*sampleInput) + I1 = self.language_id.mods["compute_features"](sampleWavs) + I2 = self.language_id.mods["mean_var_norm"](I1, sampleWavLens) + I3 = self.language_id.mods["embedding_model"](I2, sampleWavLens) + if bf16: with torch.cpu.amp.autocast(): - self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000))) - self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput) - self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput) - self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2) + self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs) + 
self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens)) + self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens)) + self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3) self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"]) self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"]) self.language_id.mods["embedding_model"] = torch.jit.freeze(self.language_id.mods["embedding_model"]) self.language_id.mods["classifier"] = torch.jit.freeze( self.language_id.mods["classifier"]) else: - self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000))) - self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput) - self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput) - self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2) + self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs) + self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens)) + self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens)) + self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3) self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"]) self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"]) @@ -114,11 +118,11 @@ def predict(self, data_path="", ipex_op=False, bf16=False, int8_model=False, ver with torch.no_grad(): if bf16: with torch.cpu.amp.autocast(): - prediction = self.language_id.classify_batch(signal) + prediction = self.language_id.classify_batch(signal) else: - prediction = self.language_id.classify_batch(signal) + prediction = self.language_id.classify_batch(signal) else: # default - prediction = self.language_id.classify_batch(signal) + prediction = self.language_id.classify_batch(signal) inference_end_time = time() inference_latency = inference_end_time - inference_start_time @@ -195,13 +199,13 @@ def main(argv): with open(OUTPUT_SUMMARY_CSV_FILE, 'w') as f: writer = csv.writer(f) writer.writerow(["Audio File", - "Input Frequency", + "Input Frequency (Hz)", "Expected Language", "Top Consensus", "Top Consensus %", "Second Consensus", "Second Consensus %", - "Average Latency", + "Average Latency (s)", "Result"]) total_samples = 0 @@ -273,12 +277,12 @@ def main(argv): predict_list = [] use_entire_audio_file = False latency_sum = 0.0 - if data.waveduration < sample_dur: + if int(data.waveduration) <= sample_dur: # Use entire audio file if the duration is less than the sampling duration use_entire_audio_file = True sample_list = [0 for _ in range(sample_size)] else: - start_time_list = list(range(sample_size - int(data.waveduration) + 1)) + start_time_list = list(range(int(data.waveduration) - sample_dur)) sample_list = [] for i in range(sample_size): 
sample_list.append(random.sample(start_time_list, 1)[0]) @@ -346,17 +350,36 @@ def main(argv): avg_latency, result ]) + else: + # Write results to a .csv file + with open(OUTPUT_SUMMARY_CSV_FILE, 'a') as f: + writer = csv.writer(f) + writer.writerow([ + filename, + sample_rate_for_csv, + "N/A", + top_occurance, + str(topPercentage) + "%", + sec_occurance, + str(secPercentage) + "%", + avg_latency, + "N/A" + ]) + if ground_truth_compare: # Summary of results print("\n\n Correctly predicted %d/%d\n" %(correct_predictions, total_samples)) - print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE)) + + print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE)) elif os.path.isfile(path): print("\nIt is a normal file", path) else: print("It is a special file (socket, FIFO, device file)" , path) + print("Done.\n") + if __name__ == "__main__": import sys sys.exit(main(sys.argv)) diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/initialize.sh b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/initialize.sh deleted file mode 100644 index 935debac44..0000000000 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/initialize.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -# Activate the oneAPI environment for PyTorch -source activate pytorch - -# Install speechbrain -git clone https://github.com/speechbrain/speechbrain.git -cd speechbrain -pip install -r requirements.txt -pip install --editable . -cd .. - -# Add speechbrain to environment variable PYTHONPATH -export PYTHONPATH=$PYTHONPATH:/Inference/speechbrain - -# Install PyTorch and Intel Extension for PyTorch (IPEX) -pip install torch==1.13.1 torchaudio -pip install --no-deps torchvision==0.14.0 -pip install intel_extension_for_pytorch==1.13.100 -pip install neural-compressor==2.0 - -# Update packages -apt-get update && apt-get install libgl1 diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/interfaces.patch b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/interfaces.patch deleted file mode 100644 index 762ae5ebee..0000000000 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/interfaces.patch +++ /dev/null @@ -1,11 +0,0 @@ ---- interfaces.py 2022-10-07 16:58:26.836359346 -0700 -+++ interfaces_new.py 2022-10-07 16:59:09.968110128 -0700 -@@ -945,7 +945,7 @@ - out_prob = self.mods.classifier(emb).squeeze(1) - score, index = torch.max(out_prob, dim=-1) - text_lab = self.hparams.label_encoder.decode_torch(index) -- return out_prob, score, index, text_lab -+ return out_prob, score, index # removed text_lab to get torchscript to work - - def classify_file(self, path): - """Classifies the given audiofile into the given set of labels. 
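The hunks above restructure the IPEX path in `inference_custom.py` so that each pipeline stage is traced with a real activation produced by the previous stage (`I1`, `I2`, `I3`) rather than a shared pre-saved `sampleInput`. Below is a minimal sketch of that optimize → trace → freeze pattern, with small placeholder `nn.Module`s standing in for the SpeechBrain `compute_features`, `embedding_model`, and `classifier` stages; the module bodies and tensor shapes are illustrative only, not the sample's real components.

```python
# Hedged sketch of the staged IPEX + TorchScript flow used in the hunks above.
# The three Stage modules are stand-ins for the SpeechBrain pipeline stages.
import torch
import torch.nn as nn
import intel_extension_for_pytorch as ipex

class Stage(nn.Module):
    def __init__(self, in_dim: int, out_dim: int):
        super().__init__()
        self.proj = nn.Linear(in_dim, out_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.relu(self.proj(x))

features, embedder, classifier = Stage(64, 32), Stage(32, 16), Stage(16, 2)
for stage in (features, embedder, classifier):
    stage.eval()  # eval mode is required before ipex.optimize and jit.freeze

# 1) Optimize each stage with IPEX (pass dtype=torch.bfloat16 for the --bf16 path).
features = ipex.optimize(features)
embedder = ipex.optimize(embedder)
classifier = ipex.optimize(classifier)

with torch.no_grad():
    # 2) Chain a sample input through the stages so every trace sees an
    #    example input with the shape it will receive at inference time.
    x0 = torch.rand(1, 64)      # stands in for the loaded sample waveforms
    x1 = features(x0)
    x2 = embedder(x1)

    # 3) Trace and freeze each stage to reduce reorder overhead at runtime.
    features = torch.jit.freeze(torch.jit.trace(features, example_inputs=(x0,)))
    embedder = torch.jit.freeze(torch.jit.trace(embedder, example_inputs=(x1,)))
    classifier = torch.jit.freeze(torch.jit.trace(classifier, example_inputs=(x2,)))

    print(classifier(embedder(features(x0))).shape)
```

In the actual script, the `--bf16` variant wraps the same tracing calls in `torch.cpu.amp.autocast()`, as shown in the hunk above.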
diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/lang_id_inference.ipynb b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/lang_id_inference.ipynb index 0ed44139b3..1cd1afee01 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/lang_id_inference.ipynb +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/lang_id_inference.ipynb @@ -47,7 +47,7 @@ "metadata": {}, "outputs": [], "source": [ - "!python inference_commonVoice.py -p /data/commonVoice/test" + "!python inference_commonVoice.py -p ${COMMON_VOICE_PATH}/processed_data/test" ] }, { @@ -55,7 +55,7 @@ "metadata": {}, "source": [ "## inference_custom.py for Custom Data \n", - "To generate an overall results output summary, the audio_ground_truth_labels.csv file needs to be modified with the name of the audio file and expected audio label (i.e. en for English). By default, this is disabled but if desired, the *--ground_truth_compare* can be used. To run inference on custom data, you must specify a folder with WAV files and pass the path in as an argument. " + "To run inference on custom data, you must specify a folder with .wav files and pass the path in as an argument. You can do so by creating a folder named `data_custom` and then copy 1 or 2 .wav files from your test dataset into it. .mp3 files will NOT work. " ] }, { @@ -65,7 +65,7 @@ "### Randomly select audio clips from audio files for prediction\n", "python inference_custom.py -p DATAPATH -d DURATION -s SIZE\n", "\n", - "An output file output_summary.csv will give the summary of the results." + "An output file `output_summary.csv` will give the summary of the results." ] }, { @@ -104,6 +104,8 @@ "### Optimizations with Intel® Extension for PyTorch (IPEX) \n", "python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose \n", "\n", + "This will apply ipex.optimize to the model(s) and TorchScript. You can also add the --bf16 option along with --ipex to run in the BF16 data type, supported on 4th Gen Intel® Xeon® Scalable processors and newer.\n", + "\n", "Note that the *--verbose* option is required to view the latency measurements. " ] }, @@ -121,7 +123,7 @@ "metadata": {}, "source": [ "## Quantization with Intel® Neural Compressor (INC)\n", - "To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to */data/commonVoice/dev* which was generated from the data preprocessing scripts in the *Training* folder. " + "To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to `$COMMON_VOICE_PATH/processed_data/dev` which was generated from the data preprocessing scripts in the `Training` folder. 
" ] }, { @@ -130,14 +132,46 @@ "metadata": {}, "outputs": [], "source": [ - "!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/dev" + "!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After quantization, the model will be stored in lang_id_commonvoice_model_INT8 and neural_compressor.utils.pytorch.load will have to be used to load the quantized model for inference. If self.language_id is the original model and data_path is the path to the audio file:\n", + "\n", + "```\n", + "from neural_compressor.utils.pytorch import load\n", + "model_int8 = load(\"./lang_id_commonvoice_model_INT8\", self.language_id)\n", + "signal = self.language_id.load_audio(data_path)\n", + "prediction = self.model_int8(signal)\n", + "```" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After quantization, the model will be stored in *lang_id_commonvoice_model_INT8* and *neural_compressor.utils.pytorch.load* will have to be used to load the quantized model for inference. " + "The code above is integrated into inference_custom.py. You can now run inference on your data using this INT8 model:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!python inference_custom.py -p data_custom -d 3 -s 50 --vad --int8_model --verbose" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### (Optional) Comparing Predictions with Ground Truth\n", + "\n", + "You can choose to modify audio_ground_truth_labels.csv to include the name of the audio file and expected audio label (like, en for English), then run inference_custom.py with the --ground_truth_compare option. By default, this is disabled." 
] }, { diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/quantize_model.py b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/quantize_model.py index 428e24142e..e5ce7f9bbc 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/quantize_model.py +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/quantize_model.py @@ -18,8 +18,6 @@ from neural_compressor.utils.pytorch import load from speechbrain.pretrained import EncoderClassifier -DEFAULT_EVAL_DATA_PATH = "/data/commonVoice/dev" - def prepare_dataset(path): data_list = [] for dir_name in os.listdir(path): @@ -33,7 +31,7 @@ def main(argv): import argparse parser = argparse.ArgumentParser() parser.add_argument('-p', type=str, required=True, help="Path to the model to be optimized") - parser.add_argument('-datapath', type=str, default=DEFAULT_EVAL_DATA_PATH, help="Path to evaluation dataset") + parser.add_argument('-datapath', type=str, required=True, help="Path to evaluation dataset") args = parser.parse_args() model_path = args.p diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_input_features.pt b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_input_features.pt deleted file mode 100644 index 61114fe706..0000000000 Binary files a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_input_features.pt and /dev/null differ diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_wavs.pt b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_wavs.pt new file mode 100644 index 0000000000..72ea7cc659 Binary files /dev/null and b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Inference/sample_wavs.pt differ diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/README.md b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/README.md index a44d562f57..623a82d85b 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/README.md +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/README.md @@ -6,7 +6,7 @@ Languages are selected from the CommonVoice dataset for training, validation, an | Area | Description |:--- |:--- -| What you will learn | How to use training and inference with SpeechBrain, Intel® Extension for PyTorch (IPEX) inference, Intel® Neural Compressor (INC) quantization, and a oneapi-aikit container +| What you will learn | How to use training and inference with SpeechBrain, Intel® Extension for PyTorch* (IPEX) inference, Intel® Neural Compressor (INC) quantization | Time to complete | 60 minutes ## Purpose @@ -17,16 +17,14 @@ Spoken audio comes in different languages and this sample uses a model to identi | Optimized for | Description |:--- |:--- -| OS | Ubuntu* 18.04 or newer -| Hardware | Intel® Xeon® processor family -| Software | Intel® OneAPI AI Analytics Toolkit
Hugging Face SpeechBrain +| OS | Ubuntu* 22.04 or newer +| Hardware | Intel® Xeon® and Core® processor families +| Software | Intel® AI Tools
Hugging Face SpeechBrain ## Key Implementation Details The [CommonVoice](https://commonvoice.mozilla.org/) dataset is used to train an Emphasized Channel Attention, Propagation and Aggregation Time Delay Neural Network (ECAPA-TDNN). This is implemented in the [Hugging Face SpeechBrain](https://huggingface.co/SpeechBrain) library. Additionally, a small Convolutional Recurrent Deep Neural Network (CRDNN) pretrained on the LibriParty dataset is used to process audio samples and output the segments where speech activity is detected. -After you have downloaded the CommonVoice dataset, the data must be preprocessed by converting the MP3 files into WAV format and separated into training, validation, and testing sets. - The model is then trained from scratch using the Hugging Face SpeechBrain library. This model is then used for inference on the testing dataset or a user-specified dataset. There is an option to utilize SpeechBrain's Voice Activity Detection (VAD) where only the speech segments from the audio files are extracted and combined before samples are randomly selected as input into the model. To improve performance, the user may quantize the trained model to INT8 using Intel® Neural Compressor (INC) to decrease latency. The sample contains three discreet phases: @@ -39,93 +37,94 @@ For both training and inference, you can run the sample and scripts in Jupyter N ## Prepare the Environment -### Downloading the CommonVoice Dataset +### Create and Set Up Environment ->**Note**: You can skip downloading the dataset if you already have a pretrained model and only want to run inference on custom data samples that you provide. +1. Create your conda environment by following the instructions on the Intel [AI Tools Selector](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-tools-selector.html). You can follow these settings: -Download the CommonVoice dataset for languages of interest from [https://commonvoice.mozilla.org/en/datasets](https://commonvoice.mozilla.org/en/datasets). +* Tool: AI Tools +* Preset or customize: Customize +* Distribution Type: conda* or pip +* Python Versions: Python* 3.9 or 3.10 +* PyTorch* Framework Optimizations: Intel® Extension for PyTorch* (CPU) +* Intel®-Optimized Tools & Libraries: Intel® Neural Compressor -For this sample, you will need to download the following languages: **Japanese** and **Swedish**. Follow Steps 1-6 below or you can execute the code. +>**Note**: Be sure to activate your environment before installing the packages. If using pip, install using `python -m pip` instead of just `pip`. + +2. Create your dataset folder and set the environment variable `COMMON_VOICE_PATH`. This needs to match with where you downloaded your dataset. +```bash +mkdir -p /data/commonVoice +export COMMON_VOICE_PATH=/data/commonVoice +``` -1. On the CommonVoice website, select the Version and Language. -2. Enter your email. -3. Check the boxes, and right-click on the download button to copy the link address. -4. Paste this link into a text editor and copy the first part of the URL up to ".tar.gz". -5. Use **GNU wget** on the URL to download the data to `/data/commonVoice`. +3. Install packages needed for MP3 to WAV conversion +```bash +sudo apt-get update && apt-get install -y ffmpeg libgl1 +``` - Alternatively, you can use a directory on your local drive (due to the large amount of data). If you opt to do so, you must change the `COMMON_VOICE_PATH` environment in `launch_docker.sh` before running the script. +4. 
Navigate to your working directory, clone the `oneapi-src` repository, and navigate to this code sample. +```bash +git clone https://github.com/oneapi-src/oneAPI-samples.git +cd oneAPI-samples/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification +``` -6. Extract the compressed folder, and rename the folder with the language (for example, English). +5. Run the bash script to install additional necessary libraries, including SpeechBrain. +```bash +source initialize.sh +``` - The file structure **must match** the `LANGUAGE_PATHS` defined in `prepareAllCommonVoice.py` in the `Training` folder for the script to run properly. +### Download the CommonVoice Dataset -These commands illustrate Steps 1-6. Notice that it downloads Japanese and Swedish from CommonVoice version 11.0. +>**Note**: You can skip downloading the dataset if you already have a pretrained model and only want to run inference on custom data samples that you provide. + +First, change to the `Dataset` directory. ``` -# Create the commonVoice directory under 'data' -sudo chmod 777 -R /data -cd /data -mkdir commonVoice -cd commonVoice - -# Download the CommonVoice data -wget \ -https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-11.0-2022-09-21/cv-corpus-11.0-2022-09-21-ja.tar.gz \ -https://mozilla-common-voice-datasets.s3.dualstack.us-west-2.amazonaws.com/cv-corpus-11.0-2022-09-21/cv-corpus-11.0-2022-09-21-sv-SE.tar.gz - -# Extract and organize the CommonVoice data into respective folders by language -tar -xf cv-corpus-11.0-2022-09-21-ja.tar.gz -mv cv-corpus-11.0-2022-09-21 japanese -tar -xf cv-corpus-11.0-2022-09-21-sv-SE.tar.gz -mv cv-corpus-11.0-2022-09-21 swedish +cd ./Dataset ``` -### Configuring the Container +The `get_dataset.py` script downloads the Common Voice dataset by doing the following: -1. Pull the `oneapi-aikit` docker image. -2. Set up the Docker environment. - ``` - docker pull intel/oneapi-aikit - ./launch_docker.sh - ``` - >**Note**: By default, the `Inference` and `Training` directories will be mounted and the environment variable `COMMON_VOICE_PATH` will be set to `/data/commonVoice` and mounted to `/data`. `COMMON_VOICE_PATH` is the location of where the CommonVoice dataset is downloaded. +- Gets the train set of the [Common Voice dataset from Huggingface](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) for Japanese and Swedish +- Downloads each mp3 and moves them to the `output_dir` folder +1. If you want to add additional languages, then modify the `language_to_code` dictionary in the file to reflect the languages to be included in the model. +3. Run the script with options. + ```bash + python get_dataset.py --output_dir ${COMMON_VOICE_PATH} + ``` + | Parameters | Description + |:--- |:--- + | `--output_dir` | Base output directory for saving the files. Default is /data/commonVoice + +Once the dataset is downloaded, navigate back to the parent directory +``` +cd .. +``` ## Train the Model with Languages This section explains how to train a model for language identification using the CommonVoice dataset, so it includes steps on how to preprocess the data, train the model, and prepare the output files for inference. -### Configure the Training Environment - -1. Change to the `Training` directory. - ``` - cd /Training - ``` -2. Source the bash script to install the necessary components. - ``` - source initialize.sh - ``` - This installs PyTorch*, the Intel® Extension for PyTorch (IPEX), and other components. 
+First, change to the `Training` directory. +``` +cd ./Training +``` -### Run in Jupyter Notebook +### Option 1: Run in Jupyter Notebook -1. Install Jupyter Notebook. - ``` - pip install notebook - ``` -2. Launch Jupyter Notebook. +1. Launch Jupyter Notebook. ``` jupyter notebook --ip 0.0.0.0 --port 8888 --allow-root ``` -3. Follow the instructions to open the URL with the token in your browser. -4. Locate and select the Training Notebook. +2. Follow the instructions to open the URL with the token in your browser. +3. Locate and select the Training Notebook. ``` lang_id_training.ipynb ``` -5. Follow the instructions in the Notebook. +4. Follow the instructions in the Notebook. -### Run in a Console +### Option 2: Run in a Console If you cannot or do not want to use Jupyter Notebook, use these procedures to run the sample and scripts locally. @@ -133,13 +132,13 @@ If you cannot or do not want to use Jupyter Notebook, use these procedures to ru 1. Acquire copies of the training scripts. (The command retrieves copies of the required VoxLingua107 training scripts from SpeechBrain.) ``` - cp speechbrain/recipes/VoxLingua107/lang_id/create_wds_shards.py create_wds_shards.py - cp speechbrain/recipes/VoxLingua107/lang_id/train.py train.py - cp speechbrain/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml train_ecapa.yaml + cp ../speechbrain/recipes/VoxLingua107/lang_id/create_wds_shards.py create_wds_shards.py + cp ../speechbrain/recipes/VoxLingua107/lang_id/train.py train.py + cp ../speechbrain/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml train_ecapa.yaml ``` 2. From the `Training` directory, apply patches to modify these files to work with the CommonVoice dataset. - ``` + ```bash patch < create_wds_shards.patch patch < train_ecapa.patch ``` @@ -154,8 +153,8 @@ The `prepareAllCommonVoice.py` script performs the following data preprocessing 1. If you want to add additional languages, then modify the `LANGUAGE_PATHS` list in the file to reflect the languages to be included in the model. 2. Run the script with options. The samples will be divided as follows: 80% training, 10% validation, 10% testing. - ``` - python prepareAllCommonVoice.py -path /data -max_samples 2000 --createCsv --train --dev --test + ```bash + python prepareAllCommonVoice.py -path $COMMON_VOICE_PATH -max_samples 2000 --createCsv --train --dev --test ``` | Parameters | Description |:--- |:--- @@ -166,27 +165,28 @@ The `prepareAllCommonVoice.py` script performs the following data preprocessing #### Create Shards for Training and Validation -1. If the `/data/commonVoice_shards` folder exists, delete the folder and the contents before proceeding. +1. If the `${COMMON_VOICE_PATH}/processed_data/commonVoice_shards` folder exists, delete the folder and the contents before proceeding. 2. Enter the following commands. + ```bash + python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/train ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/train + python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/dev ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/dev ``` - python create_wds_shards.py /data/commonVoice/train/ /data/commonVoice_shards/train - python create_wds_shards.py /data/commonVoice/dev/ /data/commonVoice_shards/dev - ``` -3. Note the shard with the largest number as `LARGEST_SHARD_NUMBER` in the output above or by navigating to `/data/commonVoice_shards/train`. +3. 
Note the shard with the largest number as `LARGEST_SHARD_NUMBER` in the output above or by navigating to `${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/train`. 4. Open the `train_ecapa.yaml` file and modify the `train_shards` variable to make the range reflect: `000000..LARGEST_SHARD_NUMBER`. -5. Repeat the process for `/data/commonVoice_shards/dev`. +5. Repeat Steps 3 and 4 for `${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/dev`. #### Run the Training Script -The YAML file `train_ecapa.yaml` with the training configurations should already be patched from the Prerequisite section. +The YAML file `train_ecapa.yaml` with the training configurations is passed as an argument to the `train.py` script to train the model. 1. If necessary, edit the `train_ecapa.yaml` file to meet your needs. | Parameters | Description |:--- |:--- + | `seed` | The seed value, which should be set to a different value for subsequent runs. Defaults to 1987. | `out_n_neurons` | Must be equal to the number of languages of interest. | `number_of_epochs` | Default is **10**. Adjust as needed. - | `batch_size` | In the trainloader_options, decrease this value if your CPU or GPU runs out of memory while running the training script. + | `batch_size` | In the trainloader_options, decrease this value if your CPU or GPU runs out of memory while running the training script. If you see a "Killed" error message, then the training script has run out of memory. 2. Run the script to train the model. ``` @@ -195,30 +195,48 @@ The YAML file `train_ecapa.yaml` with the training configurations should already #### Move Model to Inference Folder -After training, the output should be inside `results/epaca/SEED_VALUE` folder. By default SEED_VALUE is set to 1987 in the YAML file. You can change the value as needed. +After training, the output should be inside the `results/epaca/1987` folder. By default the `seed` is set to 1987 in `train_ecapa.yaml`. You can change the value as needed. -1. Copy all files with *cp -R* from `results/epaca/SEED_VALUE` into a new folder called `lang_id_commonvoice_model` in the **Inference** folder. - - The name of the folder MUST match with the pretrained_path variable defined in the YAML file. By default, it is `lang_id_commonvoice_model`. +1. Copy all files from `results/epaca/1987` into a new folder called `lang_id_commonvoice_model` in the **Inference** folder. + ```bash + cp -R results/epaca/1987 ../Inference/lang_id_commonvoice_model + ``` + The name of the folder MUST match with the pretrained_path variable defined in `train_ecapa.yaml`. By default, it is `lang_id_commonvoice_model`. 2. Change directory to `/Inference/lang_id_commonvoice_model/save`. + ```bash + cd ../Inference/lang_id_commonvoice_model/save + ``` + 3. Copy the `label_encoder.txt` file up one level. -4. Change to the latest `CKPT` folder, and copy the classifier.ckpt and embedding_model.ckpt files into the `/Inference/lang_id_commonvoice_model/` folder. + ```bash + cp label_encoder.txt ../. + ``` + +4. Change to the latest `CKPT` folder, and copy the classifier.ckpt and embedding_model.ckpt files into the `/Inference/lang_id_commonvoice_model/` folder which is two directories up. By default, the command below will navigate into the single CKPT folder that is present, but you can change it to the specific folder name. + ```bash + # Navigate into the CKPT folder + cd CKPT* + + cp classifier.ckpt ../../. + cp embedding_model.ckpt ../../ + cd ../../../.. 
+ ``` - You may need to modify the permissions of these files to be executable before you run the inference scripts to consume them. + You may need to modify the permissions of these files to be executable i.e. `sudo chmod 755` before you run the inference scripts to consume them. >**Note**: If `train.py` is rerun with the same seed, it will resume from the epoch number it last run. For a clean rerun, delete the `results` folder or change the seed. You can now load the model for inference. In the `Inference` folder, the `inference_commonVoice.py` script uses the trained model on the testing dataset, whereas `inference_custom.py` uses the trained model on a user-specified dataset and can utilize Voice Activity Detection. ->**Note**: If the folder name containing the model is changed from `lang_id_commonvoice_model`, you will need to modify the `source_model_path` variable in `inference_commonVoice.py` and `inference_custom.py` files in the `speechbrain_inference` class. +>**Note**: If the folder name containing the model is changed from `lang_id_commonvoice_model`, you will need to modify the `pretrained_path` in `train_ecapa.yaml`, and the `source_model_path` variable in both the `inference_commonVoice.py` and `inference_custom.py` files in the `speechbrain_inference` class. ## Run Inference for Language Identification >**Stop**: If you have not already done so, you must run the scripts in the `Training` folder to generate the trained model before proceeding. -To run inference, you must have already run all of the training scripts, generated the trained model, and moved files to the appropriate locations. You must place the model output in a folder name matching the name specified as the `pretrained_path` variable defined in the YAML file. +To run inference, you must have already run all of the training scripts, generated the trained model, and moved files to the appropriate locations. You must place the model output in a folder name matching the name specified as the `pretrained_path` variable defined in `train_ecapa.yaml`. >**Note**: If you plan to run inference on **custom data**, you will need to create a folder for the **.wav** files to be used for prediction. For example, `data_custom`. Move the **.wav** files to your custom folder. (For quick results, you may select a few audio files from each language downloaded from CommonVoice.) @@ -226,35 +244,23 @@ To run inference, you must have already run all of the training scripts, generat 1. Change to the `Inference` directory. ``` - cd /Inference - ``` -2. Source the bash script to install or update the necessary components. - ``` - source initialize.sh - ``` -3. Patch the Intel® Extension for PyTorch (IPEX) to use SpeechBrain models. (This patch is required for PyTorch* TorchScript to work because the output of the model must contain only tensors.) - ``` - patch ./speechbrain/speechbrain/pretrained/interfaces.py < interfaces.patch + cd ./Inference ``` -### Run in Jupyter Notebook +### Option 1: Run in Jupyter Notebook -1. If you have not already done so, install Jupyter Notebook. - ``` - pip install notebook - ``` -2. Launch Jupyter Notebook. +1. Launch Jupyter Notebook. ``` - jupyter notebook --ip 0.0.0.0 --port 8888 --allow-root + jupyter notebook --ip 0.0.0.0 --port 8889 --allow-root ``` -3. Follow the instructions to open the URL with the token in your browser. -4. Locate and select the inference Notebook. +2. Follow the instructions to open the URL with the token in your browser. +3. Locate and select the inference Notebook. 
``` lang_id_inference.ipynb ``` -5. Follow the instructions in the Notebook. +4. Follow the instructions in the Notebook. -### Run in a Console +### Option 2: Run in a Console If you cannot or do not want to use Jupyter Notebook, use these procedures to run the sample and scripts locally. @@ -287,34 +293,32 @@ Both scripts support input options; however, some options can be use on `inferen #### On the CommonVoice Dataset 1. Run the inference_commonvoice.py script. - ``` - python inference_commonVoice.py -p /data/commonVoice/test + ```bash + python inference_commonVoice.py -p ${COMMON_VOICE_PATH}/processed_data/test ``` The script should create a `test_data_accuracy.csv` file that summarizes the results. #### On Custom Data -1. Modify the `audio_ground_truth_labels.csv` file to include the name of the audio file and expected audio label (like, `en` for English). +To run inference on custom data, you must specify a folder with **.wav** files and pass the path in as an argument. You can do so by creating a folder named `data_custom` and then copy 1 or 2 **.wav** files from your test dataset into it. **.mp3** files will NOT work. - By default, this is disabled. If required, use the `--ground_truth_compare` input option. To run inference on custom data, you must specify a folder with **.wav** files and pass the path in as an argument. - -2. Run the inference_ script. - ``` - python inference_custom.py -p - ``` +Run the inference_ script. +```bash +python inference_custom.py -p +``` The following examples describe how to use the scripts to produce specific outcomes. **Default: Random Selections** 1. To randomly select audio clips from audio files for prediction, enter commands similar to the following: - ``` + ```bash python inference_custom.py -p data_custom -d 3 -s 50 ``` This picks 50 3-second samples from each **.wav** file in the `data_custom` folder. The `output_summary.csv` file summarizes the results. 2. To randomly select audio clips from audio files after applying **Voice Activity Detection (VAD)**, use the `--vad` option: - ``` + ```bash python inference_custom.py -p data_custom -d 3 -s 50 --vad ``` Again, the `output_summary.csv` file summarizes the results. @@ -324,18 +328,20 @@ The following examples describe how to use the scripts to produce specific outco **Optimization with Intel® Extension for PyTorch (IPEX)** 1. To optimize user-defined data, enter commands similar to the following: - ``` + ```bash python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose ``` + This will apply `ipex.optimize` to the model(s) and TorchScript. You can also add the `--bf16` option along with `--ipex` to run in the BF16 data type, supported on 4th Gen Intel® Xeon® Scalable processors and newer. + >**Note**: The `--verbose` option is required to view the latency measurements. **Quantization with Intel® Neural Compressor (INC)** 1. To improve inference latency, you can use the Intel® Neural Compressor (INC) to quantize the trained model from FP32 to INT8 by running `quantize_model.py`. + ```bash + python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev ``` - python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/dev - ``` - Use the `-datapath` argument to specify a custom evaluation dataset. By default, the datapath is set to the `/data/commonVoice/dev` folder that was generated from the data preprocessing scripts in the `Training` folder. 
+ Use the `-datapath` argument to specify a custom evaluation dataset. By default, the datapath is set to the `$COMMON_VOICE_PATH/processed_data/dev` folder that was generated from the data preprocessing scripts in the `Training` folder. After quantization, the model will be stored in `lang_id_commonvoice_model_INT8` and `neural_compressor.utils.pytorch.load` will have to be used to load the quantized model for inference. If `self.language_id` is the original model and `data_path` is the path to the audio file: ``` @@ -345,9 +351,16 @@ The following examples describe how to use the scripts to produce specific outco prediction = self.model_int8(signal) ``` -### Troubleshooting + The code above is integrated into `inference_custom.py`. You can now run inference on your data using this INT8 model: + ```bash + python inference_custom.py -p data_custom -d 3 -s 50 --vad --int8_model --verbose + ``` + + >**Note**: The `--verbose` option is required to view the latency measurements. + +**(Optional) Comparing Predictions with Ground Truth** -If the model appears to be giving the same output regardless of input, try running `clean.sh` to remove the `RIR_NOISES` and `speechbrain` folders. Redownload that data after cleaning by running `initialize.sh` and either `inference_commonVoice.py` or `inference_custom.py`. +You can choose to modify `audio_ground_truth_labels.csv` to include the name of the audio file and expected audio label (like, `en` for English), then run `inference_custom.py` with the `--ground_truth_compare` option. By default, this is disabled. ## License diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/clean.sh b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/clean.sh index f60b245773..30f1806c10 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/clean.sh +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/clean.sh @@ -1,5 +1,4 @@ #!/bin/bash -rm -R RIRS_NOISES -rm -R speechbrain -rm -f rirs_noises.zip noise.csv reverb.csv +echo "Deleting rir, noise, speechbrain" +rm -R rir noise diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/create_wds_shards.patch b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/create_wds_shards.patch index ddfe37588b..3d60bc627f 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/create_wds_shards.patch +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/create_wds_shards.patch @@ -1,5 +1,5 @@ ---- create_wds_shards.py 2022-09-20 14:55:48.732386718 -0700 -+++ create_wds_shards_commonvoice.py 2022-09-20 14:53:56.554637629 -0700 +--- create_wds_shards.py 2024-11-13 18:08:07.440000000 -0800 ++++ create_wds_shards_modified.py 2024-11-14 14:09:36.225000000 -0800 @@ -27,7 +27,10 @@ t, sr = torchaudio.load(audio_file_path) @@ -12,7 +12,7 @@ return t -@@ -61,27 +64,20 @@ +@@ -66,27 +69,22 @@ sample_keys_per_language = defaultdict(list) for f in audio_files: @@ -23,7 +23,9 @@ - f.as_posix(), - ) + # Common Voice format -+ # commonVoice_folder_path/common_voice__00000000.wav' ++ # commonVoice_folder_path/processed_data//common_voice__00000000.wav' ++ # DATASET_TYPE: dev, test, train ++ # LANG_ID: the label for the language + m = re.match(r"((.*)(common_voice_)(.+)(_)(\d+).wav)", f.as_posix()) + if m: diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/initialize.sh 
b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/initialize.sh deleted file mode 100644 index 78c114f2dc..0000000000 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/initialize.sh +++ /dev/null @@ -1,26 +0,0 @@ -#!/bin/bash - -# Activate the oneAPI environment for PyTorch -source activate pytorch - -# Install speechbrain -git clone https://github.com/speechbrain/speechbrain.git -cd speechbrain -pip install -r requirements.txt -pip install --editable . -cd .. - -# Add speechbrain to environment variable PYTHONPATH -export PYTHONPATH=$PYTHONPATH:/Training/speechbrain - -# Install webdataset -pip install webdataset==0.1.96 - -# Install PyTorch and Intel Extension for PyTorch (IPEX) -pip install torch==1.13.1 torchaudio -pip install --no-deps torchvision==0.14.0 -pip install intel_extension_for_pytorch==1.13.100 - -# Install libraries for MP3 to WAV conversion -pip install pydub -apt-get update && apt-get install ffmpeg diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/lang_id_training.ipynb b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/lang_id_training.ipynb index 0502d223e9..4550b88916 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/lang_id_training.ipynb +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/lang_id_training.ipynb @@ -29,9 +29,9 @@ "metadata": {}, "outputs": [], "source": [ - "!cp speechbrain/recipes/VoxLingua107/lang_id/create_wds_shards.py create_wds_shards.py\n", - "!cp speechbrain/recipes/VoxLingua107/lang_id/train.py train.py\n", - "!cp speechbrain/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml train_ecapa.yaml" + "!cp ../speechbrain/recipes/VoxLingua107/lang_id/create_wds_shards.py create_wds_shards.py\n", + "!cp ../speechbrain/recipes/VoxLingua107/lang_id/train.py train.py\n", + "!cp ../speechbrain/recipes/VoxLingua107/lang_id/hparams/train_ecapa.yaml train_ecapa.yaml" ] }, { @@ -75,7 +75,7 @@ "metadata": {}, "outputs": [], "source": [ - "!python prepareAllCommonVoice.py -path /data -max_samples 2000 --createCsv --train --dev --test" + "!python prepareAllCommonVoice.py -path $COMMON_VOICE_PATH -max_samples 2000 --createCsv --train --dev --test" ] }, { @@ -102,15 +102,15 @@ "metadata": {}, "outputs": [], "source": [ - "!python create_wds_shards.py /data/commonVoice/train/ /data/commonVoice_shards/train \n", - "!python create_wds_shards.py /data/commonVoice/dev/ /data/commonVoice_shards/dev" + "!python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/train ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/train \n", + "!python create_wds_shards.py ${COMMON_VOICE_PATH}/processed_data/dev ${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/dev" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Note down the shard with the largest number as LARGEST_SHARD_NUMBER in the output above or by navigating to */data/commonVoice_shards/train*. In *train_ecapa.yaml*, modify the *train_shards* variable to go from 000000..LARGEST_SHARD_NUMBER. Repeat the process for */data/commonVoice_shards/dev*. " + "Note down the shard with the largest number as LARGEST_SHARD_NUMBER in the output above or by navigating to `${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/train`. In `train_ecapa.yaml`, modify the `train_shards` variable to go from 000000..LARGEST_SHARD_NUMBER. Repeat the process for `${COMMON_VOICE_PATH}/processed_data/commonVoice_shards/dev`. 
" ] }, { @@ -126,6 +126,7 @@ "source": [ "### Run the training script \n", "The YAML file *train_ecapa.yaml* with the training configurations should already be patched from the Prerequisite section. The following parameters can be adjusted in the file directly as needed: \n", + "* *seed* should be set to a different value for subsequent runs. Defaults to 1987\n", "* *out_n_neurons* must be equal to the number of languages of interest \n", "* *number_of_epochs* is set to 10 by default but can be adjusted \n", "* In the trainloader_options, the *batch_size* may need to be decreased if your CPU or GPU runs out of memory while running the training script. \n", @@ -147,18 +148,57 @@ "metadata": {}, "source": [ "### Move output model to Inference folder \n", - "After training, the output should be inside results/epaca/SEED_VALUE. By default SEED_VALUE is set to 1987 in the YAML file. This value can be changed. Follow these instructions next: \n", + "After training, the output should be inside the `results/epaca/1987` folder. By default the `seed` is set to 1987 in `train_ecapa.yaml`. You can change the value as needed.\n", "\n", - "1. Copy all files with *cp -R* from results/epaca/SEED_VALUE into a new folder called *lang_id_commonvoice_model* in the Inference folder. The name of the folder MUST match with the pretrained_path variable defined in the YAML file. By default, it is *lang_id_commonvoice_model*. \n", - "2. Navigate to /Inference/land_id_commonvoice_model/save. \n", - "3. Copy the label_encoder.txt file up one level. \n", - "4. Navigate into the latest CKPT folder and copy the classifier.ckpt and embedding_model.ckpt files into the /Inference/lang_id_commonvoice_model/ level. You may need to modify the permissions of these files to be executable before you run the inference scripts to consume them. \n", + "1. Copy all files from `results/epaca/1987` into a new folder called `lang_id_commonvoice_model` in the **Inference** folder.\n", + " The name of the folder MUST match with the pretrained_path variable defined in `train_ecapa.yaml`. By default, it is `lang_id_commonvoice_model`.\n", "\n", - "Note that if *train.py* is rerun with the same seed, it will resume from the epoch number it left off of. For a clean rerun, delete the *results* folder or change the seed. \n", + "2. Change directory to `/Inference/lang_id_commonvoice_model/save`.\n", "\n", - "### Running inference\n", - "At this point, the model can be loaded and used in inference. In the Inference folder, inference_commonVoice.py uses the trained model on \n", - "the testing dataset, whereas inference_custom.py uses the trained model on a user-specified dataset and utilizes Voice Activity Detection. Note that if the folder name containing the model is changed from *lang_id_commonvoice_model*, you will need to modify inference_commonVoice.py and inference_custom.py's *source_model_path* variable in the *speechbrain_inference* class. " + "3. Copy the `label_encoder.txt` file up one level.\n", + "\n", + "4. Change to the latest `CKPT` folder, and copy the classifier.ckpt and embedding_model.ckpt files into the `/Inference/lang_id_commonvoice_model/` folder which is two directories up." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# 1)\n", + "!cp -R results/epaca/1987 ../Inference/lang_id_commonvoice_model\n", + "\n", + "# 2)\n", + "os.chdir(\"../Inference/lang_id_commonvoice_model/save\")\n", + "\n", + "# 3)\n", + "!cp label_encoder.txt ../.\n", + "\n", + "# 4) \n", + "folders = os.listdir()\n", + "for folder in folders:\n", + " if \"CKPT\" in folder:\n", + " os.chdir(folder)\n", + " break\n", + "!cp classifier.ckpt ../../.\n", + "!cp embedding_model.ckpt ../../\n", + "os.chdir(\"../../../..\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You may need to modify the permissions of these files to be executable i.e. `sudo chmod 755` before you run the inference scripts to consume them.\n", + "\n", + ">**Note**: If `train.py` is rerun with the same seed, it will resume from the epoch number it last run. For a clean rerun, delete the `results` folder or change the seed.\n", + "\n", + "You can now load the model for inference. In the `Inference` folder, the `inference_commonVoice.py` script uses the trained model on the testing dataset, whereas `inference_custom.py` uses the trained model on a user-specified dataset and can utilize Voice Activity Detection. \n", + "\n", + ">**Note**: If the folder name containing the model is changed from `lang_id_commonvoice_model`, you will need to modify the `pretrained_path` in `train_ecapa.yaml`, and the `source_model_path` variable in both the `inference_commonVoice.py` and `inference_custom.py` files in the `speechbrain_inference` class. " ] } ], diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/prepareAllCommonVoice.py b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/prepareAllCommonVoice.py index ed78ab5c35..a6ab8df1b2 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/prepareAllCommonVoice.py +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/prepareAllCommonVoice.py @@ -124,9 +124,9 @@ def main(argv): createCsv = args.createCsv # Data paths - TRAIN_PATH = commonVoicePath + "/commonVoice/train" - TEST_PATH = commonVoicePath + "/commonVoice/test" - DEV_PATH = commonVoicePath + "/commonVoice/dev" + TRAIN_PATH = commonVoicePath + "/processed_data/train" + TEST_PATH = commonVoicePath + "/processed_data/test" + DEV_PATH = commonVoicePath + "/processed_data/dev" # Prepare the csv files for the Common Voice dataset if createCsv: diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/train_ecapa.patch b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/train_ecapa.patch index 38db22cf39..c95bf540ad 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/train_ecapa.patch +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/Training/train_ecapa.patch @@ -1,60 +1,55 @@ ---- train_ecapa.yaml.orig 2023-02-09 17:17:34.849537612 +0000 -+++ train_ecapa.yaml 2023-02-09 17:19:42.936542193 +0000 -@@ -4,19 +4,19 @@ +--- train_ecapa.yaml 2024-11-13 18:08:40.313000000 -0800 ++++ train_ecapa_modified.yaml 2024-11-14 14:52:31.232000000 -0800 +@@ -4,17 +4,17 @@ # ################################ # Basic parameters -seed: 1988 +seed: 1987 - __set_seed: !apply:torch.manual_seed [!ref ] + __set_seed: !apply:speechbrain.utils.seed_everything [!ref ] output_folder: !ref results/epaca/ save_folder: !ref /save train_log: !ref /train_log.txt 
-data_folder: !PLACEHOLDER +data_folder: ./ - rir_folder: !ref - # skip_prep: False -shards_url: /data/voxlingua107_shards -+shards_url: /data/commonVoice_shards ++shards_url: /data/commonVoice/processed_data/commonVoice_shards train_meta: !ref /train/meta.json val_meta: !ref /dev/meta.json -train_shards: !ref /train/shard-{000000..000507}.tar +train_shards: !ref /train/shard-{000000..000000}.tar val_shards: !ref /dev/shard-000000.tar - # Set to directory on a large disk if you are training on Webdataset shards hosted on the web -@@ -25,7 +25,7 @@ + # Data for augmentation +@@ -32,7 +32,7 @@ ckpt_interval_minutes: 5 # Training parameters -number_of_epochs: 40 -+number_of_epochs: 10 ++number_of_epochs: 3 lr: 0.001 lr_final: 0.0001 sample_rate: 16000 -@@ -38,11 +38,11 @@ +@@ -45,10 +45,10 @@ deltas: False # Number of languages -out_n_neurons: 107 +out_n_neurons: 2 +-num_workers: 4 +-batch_size: 128 ++num_workers: 1 ++batch_size: 64 + batch_size_val: 32 train_dataloader_options: -- num_workers: 4 -- batch_size: 128 -+ num_workers: 1 -+ batch_size: 64 + num_workers: !ref +@@ -60,6 +60,21 @@ - val_dataloader_options: - num_workers: 1 -@@ -138,3 +138,20 @@ - classifier: !ref - normalizer: !ref - counter: !ref -+ -+# Below most relevant for inference using self-trained model: -+ + ############################## Augmentations ################################### + ++# Changes for code sample to work with CommonVoice dataset +pretrained_path: lang_id_commonvoice_model + +label_encoder: !new:speechbrain.dataio.encoder.CategoricalEncoder @@ -69,3 +64,6 @@ + classifier: !ref /classifier.ckpt + label_encoder: !ref /label_encoder.txt + + # Download and prepare the dataset of noisy sequences for augmentation + prepare_noise_data: !name:speechbrain.augment.preparation.prepare_dataset_from_URL + URL: !ref diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/initialize.sh b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/initialize.sh new file mode 100644 index 0000000000..0021b588b1 --- /dev/null +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/initialize.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# Install huggingface datasets and other requirements +conda install -c conda-forge -y datasets tqdm librosa jupyter ipykernel ipywidgets + +# Install speechbrain +git clone --depth 1 --branch v1.0.2 https://github.com/speechbrain/speechbrain.git +cd speechbrain +python -m pip install -r requirements.txt +python -m pip install --editable . +cd .. 
+ +# Add speechbrain to environment variable PYTHONPATH +export PYTHONPATH=$PYTHONPATH:$(pwd)/speechbrain + +# Install webdataset +python -m pip install webdataset==0.2.100 + +# Install libraries for MP3 to WAV conversion +python -m pip install pydub + +# Install notebook to run Jupyter notebooks +python -m pip install notebook diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/launch_docker.sh b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/launch_docker.sh deleted file mode 100644 index 546523f6f6..0000000000 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/launch_docker.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash - -export COMMON_VOICE_PATH="/data/commonVoice" -export DOCKER_RUN_ENVS="-e ftp_proxy=${ftp_proxy} -e FTP_PROXY=${FTP_PROXY} -e http_proxy=${http_proxy} -e HTTP_PROXY=${HTTP_PROXY} -e https_proxy=${https_proxy} -e HTTPS_PROXY=${HTTPS_PROXY} -e no_proxy=${no_proxy} -e NO_PROXY=${NO_PROXY} -e socks_proxy=${socks_proxy} -e SOCKS_PROXY=${SOCKS_PROXY} -e COMMON_VOICE_PATH=${COMMON_VOICE_PATH}" -docker run --privileged ${DOCKER_RUN_ENVS} -it --rm --network host \ - -v"/home:/home" \ - -v"/tmp:/tmp" \ - -v "${PWD}/Inference":/Inference \ - -v "${PWD}/Training":/Training \ - -v "${COMMON_VOICE_PATH}":/data \ - --shm-size 32G \ - intel/oneapi-aikit - \ No newline at end of file diff --git a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/sample.json b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/sample.json index ba157302ff..768ed8eb6d 100644 --- a/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/sample.json +++ b/AI-and-Analytics/End-to-end-Workloads/LanguageIdentification/sample.json @@ -12,8 +12,19 @@ { "id": "Language_Identification_E2E", "env": [ + "export COMMON_VOICE_PATH=/data/commonVoice" ], "steps": [ + "mkdir -p /data/commonVoice", + "apt-get update && apt-get install ffmpeg libgl1 -y", + "source initialize.sh", + "cd ./Dataset", + "python get_dataset.py --output_dir ${COMMON_VOICE_PATH}", + "cd ..", + "cd ./Training", + "jupyter nbconvert --execute --to notebook --inplace --debug lang_id_training.ipynb", + "cd ./Inference", + "jupyter nbconvert --execute --to notebook --inplace --debug lang_id_inference.ipynb" ] } ]
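For reference, a small hedged helper (not part of the sample) that checks the `<output_dir>/<language>/<lang_code>/clips` layout produced by the new `get_dataset.py` before the `sample.json` steps hand off to the training notebook. The `language_to_code` map mirrors the one added in `get_dataset.py`, and the default path falls back to the `COMMON_VOICE_PATH` environment variable set above.

```python
# Hypothetical sanity check: confirm get_dataset.py produced MP3 clips
# under <output_dir>/<language>/<lang_code>/clips for each language.
import glob
import os

language_to_code = {"japanese": "ja", "swedish": "sv-SE"}  # mirrors get_dataset.py

def check_dataset(output_dir: str = os.environ.get("COMMON_VOICE_PATH", "/data/commonVoice")) -> bool:
    ok = True
    for lang, code in language_to_code.items():
        clips_dir = os.path.join(output_dir, lang, code, "clips")
        mp3s = glob.glob(os.path.join(clips_dir, "*.mp3"))
        if mp3s:
            print(f"[ok] {clips_dir}: {len(mp3s)} MP3 clips")
        else:
            print(f"[missing] {clips_dir}: no MP3 clips found")
            ok = False
    return ok

if __name__ == "__main__":
    raise SystemExit(0 if check_dataset() else 1)
```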