@@ -0,0 +1,35 @@
import os
import shutil
import argparse
from datasets import load_dataset
from tqdm import tqdm

language_to_code = {
"japanese": "ja",
"swedish": "sv-SE"
}

def download_dataset(output_dir):
    for lang, lang_code in language_to_code.items():
        print(f"Processing dataset for language: {lang_code}")

        # Load the dataset for the specific language
        dataset = load_dataset("mozilla-foundation/common_voice_11_0", lang_code, split="train", trust_remote_code=True)

        # Create a language-specific output folder
        output_folder = os.path.join(output_dir, lang, lang_code, "clips")
        os.makedirs(output_folder, exist_ok=True)

        # Extract and copy MP3 files
        for sample in tqdm(dataset, desc=f"Extracting and copying MP3 files for {lang}"):
            audio_path = sample['audio']['path']
            shutil.copy(audio_path, output_folder)

    print("Extraction and copy complete.")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Extract and copy audio files from a dataset to a specified directory.")
    parser.add_argument("--output_dir", type=str, default="/data/commonVoice", help="Base output directory for saving the files. Default is /data/commonVoice")
    args = parser.parse_args()

    download_dataset(args.output_dir)
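For context on what the new download script produces: each `sample['audio']['path']` points into the local Hugging Face cache, so the copy step materializes the MP3 clips under `<output_dir>/<lang>/<lang_code>/clips`. A minimal sketch for sanity-checking one sample before running the full copy (streaming mode and the printed fields are assumptions for illustration, not part of this PR):

```python
from datasets import load_dataset

# Stream one Japanese sample instead of downloading the whole split
ds = load_dataset("mozilla-foundation/common_voice_11_0", "ja",
                  split="train", streaming=True, trust_remote_code=True)
sample = next(iter(ds))

# 'path' is where the source clip lives; 'sentence' is its transcript
print(sample["audio"]["path"])
print(sample["sentence"])
```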
@@ -1,7 +1,5 @@
#!/bin/bash

- rm -R RIRS_NOISES
- rm -R tmp
- rm -R speechbrain
- rm -f rirs_noises.zip noise.csv reverb.csv vad_file.txt
+ echo "Deleting .wav files, tmp"
rm -f ./*.wav
+ rm -R tmp
@@ -29,7 +29,7 @@ def __init__(self, dirpath, filename):
self.sampleRate = 0
self.waveData = ''
self.wavesize = 0
- self.waveduriation = 0
+ self.waveduration = 0
if filename.endswith(".wav") or filename.endswith(".wmv"):
self.wavefile = filename
self.wavepath = dirpath + os.sep + filename
@@ -173,12 +173,12 @@ def main(argv):
data = datafile(testDataDirectory, filename)
predict_list = []
use_entire_audio_file = False
- if data.waveduration < sample_dur:
+ if int(data.waveduration) <= sample_dur:
# Use entire audio file if the duration is less than the sampling duration
use_entire_audio_file = True
sample_list = [0 for _ in range(sample_size)]
else:
- start_time_list = list(range(sample_size - int(data.waveduration) + 1))
+ start_time_list = list(range(0, int(data.waveduration) - sample_dur))
sample_list = []
for i in range(sample_size):
sample_list.append(random.sample(start_time_list, 1)[0])
@@ -198,10 +198,6 @@ def main(argv):
predict_list.append(' ')
pass

- # Clean up
- if use_entire_audio_file:
-     os.remove("./" + data.filename)

# Pick the top rated prediction result
occurence_count = Counter(predict_list)
total_count = sum(occurence_count.values())
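For reference, the consensus step this hunk feeds into tallies the per-sample predictions with a Counter. A minimal sketch of that tally (the `most_common` percentage math is an assumption about the code below the fold; the sample values are hypothetical):

```python
from collections import Counter

predict_list = ["ja", "ja", "ja", "sv-SE", " "]  # hypothetical per-sample results
occurence_count = Counter(predict_list)          # name matches the diff
total_count = sum(occurence_count.values())

(top_label, top_n), (sec_label, sec_n) = occurence_count.most_common(2)
print(top_label, 100.0 * top_n / total_count)    # ja 60.0
print(sec_label, 100.0 * sec_n / total_count)    # sv-SE 20.0
```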
@@ -30,7 +30,7 @@ def __init__(self, dirpath, filename):
self.sampleRate = 0
self.waveData = ''
self.wavesize = 0
- self.waveduriation = 0
+ self.waveduration = 0
if filename.endswith(".wav") or filename.endswith(".wmv"):
self.wavefile = filename
self.wavepath = dirpath + os.sep + filename
@@ -61,41 +61,45 @@ def __init__(self, ipex_op=False, bf16=False, int8_model=False):
self.model_int8 = load(source_model_int8_path, self.language_id)
self.model_int8.eval()
elif ipex_op:
- self.language_id.eval()

# Optimize for inference with IPEX
print("Optimizing inference with IPEX")
+ self.language_id.eval()
sampleInput = (torch.load("./sample_input_features.pt"), torch.load("./sample_input_wav_lens.pt"))
if bf16:
print("BF16 enabled")
self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"], dtype=torch.bfloat16)
self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"], dtype=torch.bfloat16)
self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"], dtype=torch.bfloat16)
self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"], dtype=torch.bfloat16)
else:
self.language_id.mods["compute_features"] = ipex.optimize(self.language_id.mods["compute_features"])
self.language_id.mods["mean_var_norm"] = ipex.optimize(self.language_id.mods["mean_var_norm"])
self.language_id.mods["embedding_model"] = ipex.optimize(self.language_id.mods["embedding_model"])
self.language_id.mods["classifier"] = ipex.optimize(self.language_id.mods["classifier"])

# Torchscript to resolve performance issues with reorder operations
print("Applying Torchscript")
sampleWavs = torch.load("./sample_wavs.pt")
sampleWavLens = torch.ones(sampleWavs.shape[0])
with torch.no_grad():
- I2 = self.language_id.mods["embedding_model"](*sampleInput)
+ I1 = self.language_id.mods["compute_features"](sampleWavs)
+ I2 = self.language_id.mods["mean_var_norm"](I1, sampleWavLens)
+ I3 = self.language_id.mods["embedding_model"](I2, sampleWavLens)

if bf16:
with torch.cpu.amp.autocast():
self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000)))
self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput)
self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput)
self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2)
self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs)
self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens))
self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens))
self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3)

self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"])
self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"])
self.language_id.mods["embedding_model"] = torch.jit.freeze(self.language_id.mods["embedding_model"])
self.language_id.mods["classifier"] = torch.jit.freeze( self.language_id.mods["classifier"])
else:
self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=(torch.rand(1,32000)))
self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=sampleInput)
self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=sampleInput)
self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I2)
self.language_id.mods["compute_features"] = torch.jit.trace( self.language_id.mods["compute_features"] , example_inputs=sampleWavs)
self.language_id.mods["mean_var_norm"] = torch.jit.trace(self.language_id.mods["mean_var_norm"], example_inputs=(I1, sampleWavLens))
self.language_id.mods["embedding_model"] = torch.jit.trace(self.language_id.mods["embedding_model"], example_inputs=(I2, sampleWavLens))
self.language_id.mods["classifier"] = torch.jit.trace(self.language_id.mods["classifier"], example_inputs=I3)

self.language_id.mods["compute_features"] = torch.jit.freeze(self.language_id.mods["compute_features"])
self.language_id.mods["mean_var_norm"] = torch.jit.freeze(self.language_id.mods["mean_var_norm"])
@@ -114,11 +118,11 @@ def predict(self, data_path="", ipex_op=False, bf16=False, int8_model=False, ver
with torch.no_grad():
if bf16:
with torch.cpu.amp.autocast():
- prediction = self.language_id.classify_batch(signal)
+ prediction = self.language_id.classify_batch(signal)
else:
- prediction = self.language_id.classify_batch(signal)
+ prediction = self.language_id.classify_batch(signal)
else: # default
- prediction = self.language_id.classify_batch(signal)
+ prediction = self.language_id.classify_batch(signal)

inference_end_time = time()
inference_latency = inference_end_time - inference_start_time
@@ -195,13 +199,13 @@ def main(argv):
with open(OUTPUT_SUMMARY_CSV_FILE, 'w') as f:
writer = csv.writer(f)
writer.writerow(["Audio File",
"Input Frequency",
"Input Frequency (Hz)",
"Expected Language",
"Top Consensus",
"Top Consensus %",
"Second Consensus",
"Second Consensus %",
"Average Latency",
"Average Latency (s)",
"Result"])

total_samples = 0
@@ -273,12 +277,12 @@ def main(argv):
predict_list = []
use_entire_audio_file = False
latency_sum = 0.0
- if data.waveduration < sample_dur:
+ if int(data.waveduration) <= sample_dur:
# Use entire audio file if the duration is less than the sampling duration
use_entire_audio_file = True
sample_list = [0 for _ in range(sample_size)]
else:
- start_time_list = list(range(sample_size - int(data.waveduration) + 1))
+ start_time_list = list(range(int(data.waveduration) - sample_dur))
sample_list = []
for i in range(sample_size):
sample_list.append(random.sample(start_time_list, 1)[0])
@@ -346,17 +350,36 @@ def main(argv):
avg_latency,
result
])
+ else:
+     # Write results to a .csv file
+     with open(OUTPUT_SUMMARY_CSV_FILE, 'a') as f:
+         writer = csv.writer(f)
+         writer.writerow([
+             filename,
+             sample_rate_for_csv,
+             "N/A",
+             top_occurance,
+             str(topPercentage) + "%",
+             sec_occurance,
+             str(secPercentage) + "%",
+             avg_latency,
+             "N/A"
+         ])


if ground_truth_compare:
# Summary of results
print("\n\n Correctly predicted %d/%d\n" %(correct_predictions, total_samples))
print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE))

print("\n See %s for summary\n" %(OUTPUT_SUMMARY_CSV_FILE))

elif os.path.isfile(path):
print("\nIt is a normal file", path)
else:
print("It is a special file (socket, FIFO, device file)" , path)

print("Done.\n")

if __name__ == "__main__":
import sys
sys.exit(main(sys.argv))

This file was deleted.

This file was deleted.

@@ -47,15 +47,15 @@
"metadata": {},
"outputs": [],
"source": [
"!python inference_commonVoice.py -p /data/commonVoice/test"
"!python inference_commonVoice.py -p ${COMMON_VOICE_PATH}/processed_data/test"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## inference_custom.py for Custom Data \n",
"To generate an overall results output summary, the audio_ground_truth_labels.csv file needs to be modified with the name of the audio file and expected audio label (i.e. en for English). By default, this is disabled but if desired, the *--ground_truth_compare* can be used. To run inference on custom data, you must specify a folder with WAV files and pass the path in as an argument. "
"To run inference on custom data, you must specify a folder with .wav files and pass the path in as an argument. You can do so by creating a folder named `data_custom` and then copy 1 or 2 .wav files from your test dataset into it. .mp3 files will NOT work. "
]
},
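Since the notebook warns that .mp3 files will not work, a rough sketch for converting a test clip to WAV before dropping it into `data_custom` (assumes ffmpeg is installed; the 16 kHz mono settings and file names are assumptions, not something this PR specifies):

```python
import subprocess
from pathlib import Path

src = Path("clip.mp3")                      # hypothetical input clip
dst = Path("data_custom") / (src.stem + ".wav")
dst.parent.mkdir(exist_ok=True)

# ffmpeg decodes the MP3 and writes a 16 kHz mono WAV
subprocess.run(["ffmpeg", "-y", "-i", str(src),
                "-ar", "16000", "-ac", "1", str(dst)], check=True)
```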
{
@@ -65,7 +65,7 @@
"### Randomly select audio clips from audio files for prediction\n",
"python inference_custom.py -p DATAPATH -d DURATION -s SIZE\n",
"\n",
"An output file output_summary.csv will give the summary of the results."
"An output file `output_summary.csv` will give the summary of the results."
]
},
{
@@ -104,6 +104,8 @@
"### Optimizations with Intel® Extension for PyTorch (IPEX) \n",
"python inference_custom.py -p data_custom -d 3 -s 50 --vad --ipex --verbose \n",
"\n",
"This will apply ipex.optimize to the model(s) and TorchScript. You can also add the --bf16 option along with --ipex to run in the BF16 data type, supported on 4th Gen Intel® Xeon® Scalable processors and newer.\n",
"\n",
"Note that the *--verbose* option is required to view the latency measurements. "
]
},
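For readers unfamiliar with the BF16 path mentioned above, a minimal sketch of how `ipex.optimize` and CPU autocast combine (toy module for illustration; the real notebook drives this through inference_custom.py's --ipex and --bf16 flags):

```python
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(16, 4).eval()
model = ipex.optimize(model, dtype=torch.bfloat16)

# Autocast runs supported ops in BF16 and leaves the rest in FP32
with torch.no_grad(), torch.cpu.amp.autocast():
    out = model(torch.rand(1, 16))
print(out.dtype)  # torch.bfloat16
```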
@@ -121,7 +123,7 @@
"metadata": {},
"source": [
"## Quantization with Intel® Neural Compressor (INC)\n",
"To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to */data/commonVoice/dev* which was generated from the data preprocessing scripts in the *Training* folder. "
"To improve inference latency, Intel® Neural Compressor (INC) can be used to quantize the trained model from FP32 to INT8 by running quantize_model.py. The *-datapath* argument can be used to specify a custom evaluation dataset but by default it is set to `$COMMON_VOICE_PATH/processed_data/dev` which was generated from the data preprocessing scripts in the `Training` folder. "
]
},
{
Expand All @@ -130,14 +132,46 @@
"metadata": {},
"outputs": [],
"source": [
"!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/dev"
"!python quantize_model.py -p ./lang_id_commonvoice_model -datapath $COMMON_VOICE_PATH/processed_data/dev"
]
},
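The quantization call inside quantize_model.py is not shown in this diff; as a rough sketch, INC post-training quantization typically follows this shape (the toy model, config, and calibration loader are stand-ins for what the script builds from its -p and -datapath arguments):

```python
import torch
from neural_compressor import PostTrainingQuantConfig, quantization

# Toy stand-ins for the model and calibration data
model = torch.nn.Linear(16, 4).eval()
calib_loader = torch.utils.data.DataLoader(
    [(torch.rand(16), 0) for _ in range(8)], batch_size=4)

conf = PostTrainingQuantConfig()
q_model = quantization.fit(model, conf, calib_dataloader=calib_loader)
q_model.save("./lang_id_commonvoice_model_INT8")
```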
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "After quantization, the model will be stored in lang_id_commonvoice_model_INT8 and neural_compressor.utils.pytorch.load will have to be used to load the quantized model for inference. If self.language_id is the original model and data_path is the path to the audio file:\n",
+   "\n",
+   "```\n",
+   "from neural_compressor.utils.pytorch import load\n",
+   "model_int8 = load(\"./lang_id_commonvoice_model_INT8\", self.language_id)\n",
+   "signal = self.language_id.load_audio(data_path)\n",
+   "prediction = model_int8(signal)\n",
+   "```"
+  ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"After quantization, the model will be stored in *lang_id_commonvoice_model_INT8* and *neural_compressor.utils.pytorch.load* will have to be used to load the quantized model for inference. "
"The code above is integrated into inference_custom.py. You can now run inference on your data using this INT8 model:"
]
},
+ {
+  "cell_type": "code",
+  "execution_count": null,
+  "metadata": {},
+  "outputs": [],
+  "source": [
+   "!python inference_custom.py -p data_custom -d 3 -s 50 --vad --int8_model --verbose"
+  ]
+ },
+ {
+  "cell_type": "markdown",
+  "metadata": {},
+  "source": [
+   "### (Optional) Comparing Predictions with Ground Truth\n",
+   "\n",
+   "You can choose to modify audio_ground_truth_labels.csv to include the name of the audio file and the expected audio label (e.g., en for English), then run inference_custom.py with the --ground_truth_compare option. By default, this is disabled."
+  ]
+ },
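A hypothetical example of what that CSV might contain (the actual column layout of audio_ground_truth_labels.csv is not shown in this diff):

```python
import csv

# Hypothetical rows: audio file name plus expected language label
rows = [["clip_0001.wav", "en"], ["clip_0002.wav", "ja"]]
with open("audio_ground_truth_labels.csv", "w", newline="") as f:
    csv.writer(f).writerows(rows)
```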
{
@@ -18,8 +18,6 @@
from neural_compressor.utils.pytorch import load
from speechbrain.pretrained import EncoderClassifier

- DEFAULT_EVAL_DATA_PATH = "/data/commonVoice/dev"

def prepare_dataset(path):
data_list = []
for dir_name in os.listdir(path):
@@ -33,7 +31,7 @@ def main(argv):
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('-p', type=str, required=True, help="Path to the model to be optimized")
- parser.add_argument('-datapath', type=str, default=DEFAULT_EVAL_DATA_PATH, help="Path to evaluation dataset")
+ parser.add_argument('-datapath', type=str, required=True, help="Path to evaluation dataset")
args = parser.parse_args()

model_path = args.p
Binary file not shown.
Binary file not shown.