[feature] Add support for score calibration #320

Merged: 8 commits, May 17, 2024
Changes from 6 commits
3 changes: 3 additions & 0 deletions README.md
@@ -60,6 +60,7 @@ pre-commit install # for clean and tidy code
```

## 🔥 News
* 2024.05.15: Add support for score calibration, see [#320](https://github.com/wenet-e2e/wespeaker/pull/320)
* 2024.04.25: Add support for the gemini-dfresnet model, see [#291](https://github.com/wenet-e2e/wespeaker/pull/291)
* 2024.04.23: Support MNN inference engine in runtime, see [#310](https://github.com/wenet-e2e/wespeaker/pull/310)
* 2024.04.02: Release [Wespeaker document](http://wenet.org.cn/wespeaker) with detailed model-training tutorials, introduction of various runtime platforms, etc.
@@ -72,11 +73,13 @@ pre-commit install # for clean and tidy code
## Recipes

* [VoxCeleb](https://github.com/wenet-e2e/wespeaker/tree/master/examples/voxceleb): Speaker Verification recipe on the [VoxCeleb dataset](https://www.robots.ox.ac.uk/~vgg/data/voxceleb/)
* 🔥 UPDATE 2024.05.15: We support score calibration for VoxCeleb and achieve better performance!
* 🔥 UPDATE 2023.07.10: We support self-supervised learning recipe on Voxceleb! Achieving **2.627%** (ECAPA_TDNN_GLOB_c1024) EER on vox1-O-clean test set without any labels.
* 🔥 UPDATE 2022.10.31: We support deep r-vector up to the 293-layer version! Achieving **0.447%/0.043** EER/mindcf on vox1-O-clean test set
* 🔥 UPDATE 2022.07.19: We apply the same setups as the CNCeleb recipe, and obtain SOTA performance considering the open-source systems
- EER/minDCF on vox1-O-clean test set are **0.723%/0.069** (ResNet34) and **0.728%/0.099** (ECAPA_TDNN_GLOB_c1024), after LM fine-tuning and AS-Norm
* [CNCeleb](https://github.com/wenet-e2e/wespeaker/tree/master/examples/cnceleb/v2): Speaker Verification recipe on the [CnCeleb dataset](http://cnceleb.org/)
* 🔥 UPDATE 2024.05.16: We support score calibration for CNCeleb and achieve a better EER.
* 🔥 UPDATE 2022.10.31: 221-layer ResNet achieves **5.655%/0.330** EER/minDCF
* 🔥 UPDATE 2022.07.12: We migrate the winner system of CNSRC 2022 [report](https://aishell-cnsrc.oss-cn-hangzhou.aliyuncs.com/T082.pdf) [slides](https://aishell-cnsrc.oss-cn-hangzhou.aliyuncs.com/T082-ZhengyangChen.pdf)
- EER/minDCF reduction from 8.426%/0.487 to **6.492%/0.354** after large margin fine-tuning and AS-Norm
8 changes: 8 additions & 0 deletions examples/cnceleb/v2/README.md
@@ -2,6 +2,14 @@

* Setup: fbank80, num_frms200, epoch150, ArcMargin, aug_prob0.6, speed_perturb (no spec_aug)
* test_trials: CNC-Eval-Avg.lst

* 🔥 UPDATE 2024.05.16: We add support for score calibration on CNCeleb. It improves the EER but degrades the minDCF compared with the AS-Norm results.
| Model | Params | FLOPs | LM | AS-Norm | Score Calibration | EER (%) | minDCF (p=0.01) |
| :------------------------------ | :-------: | :-----: | :-: | :-------: | :---------------: | :-------: | :--------------: |
| ResNet34-TSTP-emb256 | 6.63M | 4.55 G | × | × | × | 7.124 | 0.408 |
| | | | × | √ | × | 6.742 | 0.367 |
| | | | × | √ | √ | 6.336 | 0.374 |

* 🔥 UPDATE 2022.07.12: We update this recipe according to the setups in the winning system of CNSRC 2022, and get obvious performance improvement compared with the old recipe. Check the [commit1](https://github.com/wenet-e2e/wespeaker/pull/63/commits/b08804987b3bbb26f4963cedf634058474c743dd), [commit2](https://github.com/wenet-e2e/wespeaker/pull/66/commits/6f6af29197f0aa0a5d1b1993b7feb2f41b97891f) for details.
* LR scheduler warmup from 0
* Remove one embedding layer
112 changes: 112 additions & 0 deletions examples/cnceleb/v2/local/score_calibration.sh
@@ -0,0 +1,112 @@
#!/bin/bash

# Copyright (c) 2022 Chengdong Liang ([email protected])
# 2024 Zhengyang Chen ([email protected])
# 2024 Bing Han ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

score_norm_method="asnorm" # asnorm/snorm
cohort_set=cnceleb_train
calibration_trial="cn_dev_cali.kaldi"
top_n=100
exp_dir=''
trials="CNC-Eval-Concat.lst CNC-Eval-Avg.lst"
data=data

stage=-1
stop_stage=-1

. tools/parse_options.sh
. path.sh

output_name=${cohort_set}_${score_norm_method}
[ "${score_norm_method}" == "asnorm" ] && output_name=${output_name}${top_n}
trials_dir=${data}/eval/trials

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
echo "Score calibration set"
# Compute duration
if [ ! -f ${data}/cnceleb_train/utt2dur ]; then
cat ${data}/statistics/cnceleb2/comb_stat/utt2dur ${data}/statistics/cnceleb1/comb_stat/utt2dur | awk '{print $1".wav", $2}' > ${data}/cnceleb_train/utt2dur
fi
# generate trial for calibration
if [ ! -e ${trials_dir}/${calibration_trial} ]; then
python tools/generate_calibration_trial.py --utt2dur ${data}/cnceleb_train/utt2dur --trial_path ${trials_dir}/${calibration_trial} --each_trial_num 20000
fi

python wespeaker/bin/score.py \
--exp_dir ${exp_dir} \
--eval_scp_path ${exp_dir}/embeddings/cnceleb_train/xvector.scp \
--cal_mean True \
--cal_mean_dir ${exp_dir}/embeddings/cnceleb_train \
${trials_dir}/${calibration_trial}

python wespeaker/bin/score_norm.py \
--score_norm_method $score_norm_method \
--top_n $top_n \
--trial_score_file $exp_dir/scores/${calibration_trial}.score \
--score_norm_file $exp_dir/scores/${output_name}_${calibration_trial}.score \
--cohort_emb_scp ${exp_dir}/embeddings/${cohort_set}/spk_xvector.scp \
--eval_emb_scp ${exp_dir}/embeddings/cnceleb_train/xvector.scp \
--mean_vec_path ${exp_dir}/embeddings/cnceleb_train/mean_vec.npy
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
echo "Prepare calibration factors"
# gather calibration factor
mkdir -p ${exp_dir}/scores/calibration
for x in ${calibration_trial} $trials; do
python wespeaker/bin/score_calibration.py "gather_calibration_factors" \
--wav_dur_scp ${exp_dir}/scores/calibration/utt2dur \
--max_dur 20 \
--score_norm_file ${exp_dir}/scores/${output_name}_${x}.score \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${x}.calibration \
--drop_duration True
done
fi


if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
echo "Train calibration model"
python wespeaker/bin/score_calibration.py "train_calibration_model" \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${calibration_trial}.calibration \
--save_model_path ${exp_dir}/scores/calibration/calibration_model.pt
fi

cali_output_name=cali_${output_name}
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
echo "Infer calibration model"
for x in ${trials}; do
python wespeaker/bin/score_calibration.py "infer_calibration" \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${x}.calibration \
--save_model_path ${exp_dir}/scores/calibration/calibration_model.pt \
--calibration_score_file ${exp_dir}/scores/${cali_output_name}_${x}.score
done
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
echo "compute metrics"
for x in ${trials}; do
scores_dir=${exp_dir}/scores
python wespeaker/bin/compute_metrics.py \
--p_target 0.01 \
--c_fa 1 \
--c_miss 1 \
${scores_dir}/${cali_output_name}_${x}.score \
2>&1 | tee -a ${scores_dir}/cnc_cali_${score_norm_method}${top_n}_result

python wespeaker/bin/compute_det.py \
${scores_dir}/${cali_output_name}_${x}.score
done
fi
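
For orientation, stages 2–4 above delegate the actual calibration to `wespeaker/bin/score_calibration.py`. Conceptually, the calibration model is a small affine map from the normalized score (plus optional duration-based quality factors) to a calibrated score, trained with a binary cross-entropy objective on labelled calibration trials. The sketch below only illustrates that idea under assumed names (`CalibrationModel`, the factor layout); it is not the actual wespeaker implementation.

```python
# Hypothetical sketch of linear score calibration; NOT the actual
# wespeaker/bin/score_calibration.py implementation.
import torch
import torch.nn as nn


class CalibrationModel(nn.Module):
    """Affine fusion of the raw score and quality factors."""

    def __init__(self, num_factors: int):
        super().__init__()
        self.linear = nn.Linear(num_factors, 1)

    def forward(self, factors: torch.Tensor) -> torch.Tensor:
        # factors: (num_trials, num_factors), e.g.
        # [asnorm_score, log(enroll_dur), log(test_dur)]
        return self.linear(factors).squeeze(-1)


def train_calibration(factors: torch.Tensor, labels: torch.Tensor,
                      num_steps: int = 1000) -> CalibrationModel:
    """Fit on calibration trials; labels are 1 (target) / 0 (non-target)."""
    model = CalibrationModel(factors.size(1))
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    loss_fn = nn.BCEWithLogitsLoss()
    for _ in range(num_steps):
        optimizer.zero_grad()
        loss = loss_fn(model(factors), labels.float())
        loss.backward()
        optimizer.step()
    return model


# Inference: the calibrated score for an evaluation trial is the model
# output on that trial's factor vector, e.g.
#   model = train_calibration(train_factors, train_labels)
#   cal_scores = model(eval_factors)
```

Because the calibration trials are drawn from the training data (stage 1), the evaluation trials are never used to fit the calibration weights.
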
23 changes: 21 additions & 2 deletions examples/cnceleb/v2/run.sh
@@ -3,6 +3,7 @@
# Copyright 2022 Hongji Wang ([email protected])
# 2022 Chengdong Liang ([email protected])
# 2022 Zhengyang Chen ([email protected])
# 2024 Bing Han ([email protected])

. ./path.sh || exit 1

@@ -114,7 +115,25 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
--trials "$trials"
fi

# ================== Score Calibration ==================
# Note that score calibration is optional. For CN-Celeb,
# it improves the EER but degrades the minDCF.

if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
echo "Score calibration ..."
local/score_calibration.sh \
--stage 1 --stop-stage 5 \
--score_norm_method $score_norm_method \
--calibration_trial "cn_dev_cali.kaldi" \
--cohort_set cnceleb_train \
--top_n $top_n \
--exp_dir $exp_dir \
--data ${data} \
--trials "$trials"
fi

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
echo "Export the best model ..."
python wespeaker/bin/export_jit.py \
--config $exp_dir/config.yaml \
@@ -130,13 +149,13 @@ fi
# process will take longer segments as input and will take
# up more GPU memory.

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
echo "Large margin fine-tuning ..."
lm_exp_dir=${exp_dir}-LM
mkdir -p ${lm_exp_dir}/models
# Use the pre-trained average model to initialize the LM training
cp ${exp_dir}/models/avg_model.pt ${lm_exp_dir}/models/model_0.pt
bash run.sh --stage 3 --stop_stage 7 \
bash run.sh --stage 3 --stop_stage 8 \
--data ${data} \
--data_type ${data_type} \
--config ${lm_config} \
11 changes: 11 additions & 0 deletions examples/voxceleb/v2/README.md
@@ -4,6 +4,17 @@
* Scoring: cosine (sub mean of vox2_dev)
* Metric: EER(%)

* 🔥 UPDATE 2024.05.14: We support a score calibration strategy (see [QMF](https://arxiv.org/pdf/2010.11255.pdf)) and obtain better performance; a conceptual sketch of the idea follows after this list.

| Model | Params | Flops | LM | AS-Norm | QMF | vox1-O-clean | vox1-E-clean | vox1-H-clean |
|:------|:------:|:------|:--:|:-------:|:---:|:------------:|:------------:|:------------:|
| ResNet34-TSTP-emb256 | 6.63M | 4.55G | × | × | × | 0.862 | 1.053 | 1.966 |
| | | | × | √ | × | 0.792 | 0.970 | 1.728 |
| | | | × | √ | √ | 0.718 | 0.911 | 1.606 |
| | | | √ | × | × | 0.797 | 0.943 | 1.702 |
| | | | √ | √ | × | 0.723 | 0.874 | 1.537 |
| | | | √ | √ | √ | 0.659 | 0.821 | 1.437 |

* 🔥 UPDATE 2022.07.19: We apply the same setups as the winning system of CNSRC 2022 (see [cnceleb](https://github.com/wenet-e2e/wespeaker/tree/master/examples/cnceleb/v2) recipe for details), and obtain significant performance improvement.
* LR scheduler warmup from 0
* Remove one embedding layer in ResNet models
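
As a rough illustration of the QMF-based calibration referenced in the 2024.05.14 update above: each trial score is combined with quality measures, here capped log durations of the enrollment and test utterances, through an affine map whose weights are learned on calibration trials. The function names, the choice of quality measures, and the example weights below are assumptions for illustration, not wespeaker's exact recipe.

```python
# Hypothetical QMF-style calibration: combine the raw (normalized) score
# with duration-based quality measures using already-trained weights.
import math

MAX_DUR = 20.0  # cap durations, mirroring --max_dur 20 in the scripts


def quality_factors(score, enroll_dur, test_dur):
    """Build the factor vector for one trial (assumed feature layout)."""
    return [
        score,
        math.log(min(enroll_dur, MAX_DUR)),
        math.log(min(test_dur, MAX_DUR)),
    ]


def calibrate(score, enroll_dur, test_dur, weights, bias):
    """Apply trained affine weights to obtain the calibrated score."""
    factors = quality_factors(score, enroll_dur, test_dur)
    return sum(w * f for w, f in zip(weights, factors)) + bias


# Example call with made-up weights:
#   calibrate(score=0.43, enroll_dur=7.9, test_dur=3.2,
#             weights=[2.1, 0.05, 0.07], bias=-1.3)
```
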
114 changes: 114 additions & 0 deletions examples/voxceleb/v2/local/score_calibration.sh
@@ -0,0 +1,114 @@
#!/bin/bash

# Copyright (c) 2022 Chengdong Liang ([email protected])
# 2024 Zhengyang Chen ([email protected])
# 2024 Bing Han ([email protected])
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

score_norm_method="asnorm" # asnorm/snorm
cohort_set=vox2_dev
calibration_trial="vox2_cali.kaldi"
top_n=100
exp_dir=''
trials="vox1_O_cleaned.kaldi vox1_E_cleaned.kaldi vox1_H_cleaned.kaldi"
data=data

stage=-1
stop_stage=-1

. tools/parse_options.sh
. path.sh

output_name=${cohort_set}_${score_norm_method}
[ "${score_norm_method}" == "asnorm" ] && output_name=${output_name}${top_n}
trials_dir=${data}/vox1/trials

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
echo "Score calibration set"
# Compute duration
for dset in vox2_dev vox1; do
if [ ! -f ${data}/${dset}/utt2dur ]; then
python tools/wav2dur.py ${data}/${dset}/wav.scp ${data}/${dset}/utt2dur > ${data}/${dset}/dur.log
fi
done
# generate trial for calibration
if [ ! -e ${trials_dir}/${calibration_trial} ]; then
python tools/generate_calibration_trial.py --utt2dur ${data}/vox2_dev/utt2dur --trial_path ${trials_dir}/${calibration_trial}
fi

python wespeaker/bin/score.py \
--exp_dir ${exp_dir} \
--eval_scp_path ${exp_dir}/embeddings/vox2_dev/xvector.scp \
--cal_mean True \
--cal_mean_dir ${exp_dir}/embeddings/vox2_dev \
${trials_dir}/${calibration_trial}

python wespeaker/bin/score_norm.py \
--score_norm_method $score_norm_method \
--top_n $top_n \
--trial_score_file $exp_dir/scores/${calibration_trial}.score \
--score_norm_file $exp_dir/scores/${output_name}_${calibration_trial}.score \
--cohort_emb_scp ${exp_dir}/embeddings/${cohort_set}/spk_xvector.scp \
--eval_emb_scp ${exp_dir}/embeddings/vox2_dev/xvector.scp \
--mean_vec_path ${exp_dir}/embeddings/vox2_dev/mean_vec.npy
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
echo "Prepare calibration factors"
# gather calibration factor
mkdir -p ${exp_dir}/scores/calibration
cat ${data}/vox1/utt2dur ${data}/vox2_dev/utt2dur > ${exp_dir}/scores/calibration/utt2dur
for x in ${calibration_trial} $trials; do
python wespeaker/bin/score_calibration.py "gather_calibration_factors" \
--wav_dur_scp ${exp_dir}/scores/calibration/utt2dur \
--max_dur 20 \
--score_norm_file ${exp_dir}/scores/${output_name}_${x}.score \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${x}.calibration
done
fi


if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
echo "Train calibration model"
python wespeaker/bin/score_calibration.py "train_calibration_model" \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${calibration_trial}.calibration \
--save_model_path ${exp_dir}/scores/calibration/calibration_model.pt
fi

cali_output_name=cali_${output_name}
if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
echo "Infer calibration model"
for x in ${trials}; do
python wespeaker/bin/score_calibration.py "infer_calibration" \
--calibration_factor_file ${exp_dir}/scores/calibration/${output_name}_${x}.calibration \
--save_model_path ${exp_dir}/scores/calibration/calibration_model.pt \
--calibration_score_file ${exp_dir}/scores/${cali_output_name}_${x}.score
done
fi

if [ $stage -le 5 ] && [ $stop_stage -ge 5 ]; then
echo "compute metrics"
for x in ${trials}; do
scores_dir=${exp_dir}/scores
python wespeaker/bin/compute_metrics.py \
--p_target 0.01 \
--c_fa 1 \
--c_miss 1 \
${scores_dir}/${cali_output_name}_${x}.score \
2>&1 | tee -a ${scores_dir}/vox1_cali_${score_norm_method}${top_n}_result

python wespeaker/bin/compute_det.py \
${scores_dir}/${cali_output_name}_${x}.score
done
fi
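
Stage 1 of the script above builds a labelled calibration trial list with `tools/generate_calibration_trial.py`. A minimal sketch of what such a generator could look like is given below, assuming the speaker ID can be recovered from the utterance ID and that target/non-target pairs are sampled at random; the real tool's conventions and options may differ.

```python
# Hypothetical sketch of building a calibration trial list from utt2dur.
# Assumes the speaker ID is the first '/'-separated field of the utterance
# ID, which may not match the real tool's convention.
import random
from collections import defaultdict


def generate_trials(utt2dur_path, trial_path, each_trial_num=20000):
    # Group utterances by (assumed) speaker ID.
    spk2utts = defaultdict(list)
    with open(utt2dur_path) as fin:
        for line in fin:
            utt = line.split()[0]
            spk2utts[utt.split('/')[0]].append(utt)
    spks = [s for s in spk2utts if len(spk2utts[s]) >= 2]

    with open(trial_path, 'w') as fout:
        # Target trials: two different utterances from the same speaker.
        for _ in range(each_trial_num):
            spk = random.choice(spks)
            u1, u2 = random.sample(spk2utts[spk], 2)
            fout.write(f"{u1} {u2} target\n")
        # Non-target trials: utterances from two different speakers.
        for _ in range(each_trial_num):
            s1, s2 = random.sample(list(spk2utts), 2)
            fout.write(f"{random.choice(spk2utts[s1])} "
                       f"{random.choice(spk2utts[s2])} nontarget\n")
```

The resulting labelled pairs are what stage 1 then scores and normalizes, providing the (score, duration) examples on which the calibration model is trained.
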
17 changes: 15 additions & 2 deletions examples/voxceleb/v2/run.sh
@@ -114,20 +114,33 @@ if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
fi

if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
echo "Score calibration ..."
local/score_calibration.sh \
--stage 1 --stop-stage 5 \
--score_norm_method $score_norm_method \
--calibration_trial "vox2_cali.kaldi" \
--cohort_set vox2_dev \
--top_n $top_n \
--data ${data} \
--exp_dir $exp_dir \
--trials "$trials"
fi

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
echo "Export the best model ..."
python wespeaker/bin/export_jit.py \
--config $exp_dir/config.yaml \
--checkpoint $exp_dir/models/avg_model.pt \
--output_file $exp_dir/models/final.zip
fi

if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
echo "Large margin fine-tuning ..."
lm_exp_dir=${exp_dir}-LM
mkdir -p ${lm_exp_dir}/models
# Use the pre-trained average model to initialize the LM training
cp ${exp_dir}/models/avg_model.pt ${lm_exp_dir}/models/model_0.pt
bash run.sh --stage 3 --stop_stage 7 \
bash run.sh --stage 3 --stop_stage 8 \
--data ${data} \
--data_type ${data_type} \
--config ${lm_config} \