Merge branch 'main' into handle_retro_import_error

guyueh1 committed Dec 5, 2024
2 parents 16d85f2 + 1f492d3 · commit 5a94ed2
Showing 27 changed files with 1,445 additions and 211 deletions.
49 changes: 40 additions & 9 deletions .github/workflows/cicd-main.yml
@@ -2109,10 +2109,10 @@ jobs:
# }
# }

L2_Megatron_LM_To_NeMo_Conversion:
Optional_L2_Megatron_LM_To_NeMo_Conversion:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -2223,6 +2223,7 @@ jobs:
--model_type gpt \
--hparams_file /tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \
--convert_mlm
IS_OPTIONAL: true

L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
@@ -3322,10 +3323,10 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2:
Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -3390,6 +3391,7 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
IS_OPTIONAL: true

L2_Megatron_T5_w_Mixture_of_Expert_Pretraining:
needs: [cicd-test-container-setup]
@@ -4401,6 +4403,33 @@ jobs:
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft canonical_lora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft canonical_lora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4430,10 +4459,10 @@ jobs:
--model mixtral \
--dist-opt
L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1:
Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -4444,6 +4473,7 @@
--mbs 1 \
--model mixtral \
--dist-opt
IS_OPTIONAL: true

L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1:
needs: [cicd-test-container-setup]
@@ -4587,7 +4617,7 @@ jobs:
- L2_RAG_Pipeline_Generating
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Skip_Train
- L2_Megatron_LM_To_NeMo_Conversion
# - Optional_L2_Megatron_LM_To_NeMo_Conversion
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Drop_Optimizer_States_TP2
@@ -4609,7 +4639,7 @@
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
# - Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
- L2_Megatron_T5_w_Mixture_of_Expert_Pretraining
- L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Eval
@@ -4646,9 +4676,10 @@ jobs:
- L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
- L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
- L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
# - Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
- L2_NEMO_2_LoRA_MERGE
5 changes: 0 additions & 5 deletions .github/workflows/secrets-detector.yml
@@ -25,11 +25,6 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
with:
# setup repository and ref for PRs, see
# https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.ref }}
# custom token is required to trigger actions after reformatting + pushing
fetch-depth: 0
token: ${{ secrets.NEMO_REFORMAT_TOKEN }}

12 changes: 6 additions & 6 deletions Dockerfile.ci
@@ -54,15 +54,15 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3
ARG MCORE_TAG=2f67f35b4a9e5596034e93eddeed07009bcf630c

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
--mount=type=bind,source=tools,target=tools \
--mount=type=bind,source=setup.py,target=setup.py \
--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
--mount=type=bind,source=requirements,target=requirements \
--mount=type=bind,source=tools,target=tools \
--mount=type=bind,source=setup.py,target=setup.py \
--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
3 changes: 2 additions & 1 deletion examples/asr/conf/speech_multitask/fast-conformer_aed.yaml
@@ -24,11 +24,12 @@ spl_tokens:
model:
sample_rate: 16000
label_smoothing: 0.0
context_len_for_AR_decoding: 5 # Length of input prompt tokens. For example, in Canary models, we use [BOS,src_lang,task,tgt_lang,pnc] and thus the length is 5
use_loss_mask_for_prompt: false
log_prediction: true # enables logging sample predictions in the output during training

# Important ! Set the prompt format to the class you need
prompt_format: ??? # Options supported: ["canary"]
prompt_defaults: null

model_defaults:
asr_enc_hidden: 1024
24 changes: 23 additions & 1 deletion nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -16,6 +16,7 @@

import torch.utils.data
from lhotse import CutSet
from lhotse.cut import MixedCut
from lhotse.dataset import AudioSamples
from lhotse.dataset.collation import collate_vectors

@@ -99,7 +100,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch:
prompt_lens=prompt_lens,
prompted_transcript=prompts_with_answers,
prompted_transcript_lens=prompts_with_answers_lens,
cuts=cuts.drop_in_memory_data(),
cuts=_drop_in_memory_data(cuts),
)

def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
@@ -111,3 +112,24 @@ def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple

class ProbablyIncorrectLanguageKeyError(RuntimeError):
pass


def _drop_in_memory_data(
cuts: CutSet,
_fields=frozenset(MixedCut.__dataclass_fields__.keys()),
) -> CutSet:
"""Workaround for an edge case in cuts.drop_in_memory_data() on MixedCut with Lhotse<1.29.0"""
ans = []
for c in cuts:
# Not a mixed cut or a mixed cut that wasn't assigned any extra attributes.
if not isinstance(c, MixedCut) or _fields.issuperset(c.__dict__.keys()):
ans.append(c.drop_in_memory_data())
else:
extra_attrs = {k: v for k, v in c.__dict__.items() if k not in _fields}
for k in extra_attrs:
delattr(c, k)
ans.append(c.drop_in_memory_data())
for k, v in extra_attrs.items():
setattr(ans[-1], k, v)
setattr(c, k, v)
return CutSet(ans)
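
The `_drop_in_memory_data` helper above works around a Lhotse (<1.29.0) edge case where calling `drop_in_memory_data()` on a `MixedCut` rebuilds the cut from its dataclass fields and silently loses any extra attributes attached to the instance; the workaround detaches those attributes, drops the in-memory audio, then reattaches them to both the new and the original cut. The following self-contained sketch illustrates the same detach/reattach pattern on a stand-in dataclass rather than the real Lhotse `MixedCut`; `FakeCut` and `drop_in_memory_data_keeping_extras` are hypothetical names used only for illustration.

```python
from dataclasses import dataclass, fields


@dataclass
class FakeCut:
    """Stand-in for a Lhotse MixedCut: a dataclass that may carry ad-hoc extra attributes."""

    id: str
    audio: bytes | None = None

    def drop_in_memory_data(self) -> "FakeCut":
        # Mimics the pre-1.29.0 behaviour: the cut is rebuilt from its declared
        # dataclass fields only, so extra instance attributes would be lost.
        return FakeCut(id=self.id, audio=None)


def drop_in_memory_data_keeping_extras(cut: FakeCut) -> FakeCut:
    """Same detach/reattach pattern as _drop_in_memory_data above (hypothetical helper)."""
    declared = {f.name for f in fields(cut)}
    extra = {k: v for k, v in cut.__dict__.items() if k not in declared}
    for k in extra:
        delattr(cut, k)
    new_cut = cut.drop_in_memory_data()
    for k, v in extra.items():
        setattr(new_cut, k, v)  # re-attach to the rebuilt cut
        setattr(cut, k, v)  # restore the original cut as well
    return new_cut


if __name__ == "__main__":
    c = FakeCut(id="utt-1", audio=b"\x00" * 16000)
    c.custom_lang = "en"  # extra attribute a dataset might attach
    out = drop_in_memory_data_keeping_extras(c)
    assert out.audio is None and out.custom_lang == "en" and c.custom_lang == "en"
```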
50 changes: 36 additions & 14 deletions nemo/collections/asr/models/aed_multitask_models.py
@@ -70,7 +70,8 @@ def lens_to_mask(lens, max_length):
Create a mask from a tensor of lengths.
"""
batch_size = lens.shape[0]
mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None]
arange = torch.arange(max_length, device=lens.device)
mask = arange.expand(batch_size, max_length) < lens.unsqueeze(1)
return mask


@@ -697,24 +698,34 @@ def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb):
return torch.tensor([0.0])

input_ids, labels = batch.get_decoder_inputs_outputs()
input_ids_lens = batch.prompted_transcript_lens - 1

num_frames = batch.audio_lens.sum().float()
num_tokens = batch.prompted_transcript_lens.sum().float()
tot_frames = torch.as_tensor(batch.audio.numel(), device=num_frames.device, dtype=torch.float)
tot_tokens = torch.as_tensor(batch.prompted_transcript.numel(), device=num_frames.device, dtype=torch.float)

transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
input_signal=batch.audio,
input_signal_length=batch.audio_lens,
transcript=input_ids,
transcript_length=batch.prompted_transcript_lens,
transcript_length=input_ids_lens,
)

audio_loss = self.loss(log_probs=transf_log_probs, labels=labels)
# Mask components: 1) discard padding & 2) discard prompt (notice the negation)
# For a full decoder sequence O with len M, the loss mask skips the first element,
# covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
if self.cfg.get("use_loss_mask_for_prompt", False):
maxlen = batch.prompted_transcript.shape[1] - 1
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
else:
loss_mask = None
audio_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)

num_frames = batch.audio_lens.sum()
num_tokens = batch.prompted_transcript_lens.sum()
tot_frames = batch.audio.numel()
tot_tokens = batch.prompted_transcript.numel()
tensorboard_logs = {
'train_loss': audio_loss,
'learning_rate': self._optimizer.param_groups[0]['lr'],
'batch_size': batch.audio.shape[0],
'learning_rate': torch.as_tensor(self._optimizer.param_groups[0]['lr']),
'batch_size': torch.as_tensor(batch.audio.shape[0]),
'num_frames': num_frames,
'num_tokens': num_tokens,
'input_to_padding_ratio': num_frames / tot_frames,
@@ -725,6 +736,7 @@ def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb):

def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, dataloader_idx=0, eval_mode="val"):
input_ids, labels = batch.get_decoder_inputs_outputs()
input_ids_lens = batch.prompted_transcript_lens - 1

transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
input_signal=batch.audio,
@@ -733,11 +745,19 @@ def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, datalo
transcript_length=batch.prompted_transcript_lens,
)

transf_loss = self.loss(log_probs=transf_log_probs, labels=labels)
self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1])
output_dict = {
f'{eval_mode}_loss': transf_loss,
}
# Mask components: 1) discard padding & 2) discard prompt (notice the negation)
# For a full decoder sequence O with len M, the loss mask skips the first element,
# covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
if self.cfg.get("use_loss_mask_for_prompt", False):
maxlen = batch.prompted_transcript.shape[1] - 1
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
num_measurements = loss_mask.long().sum()
else:
loss_mask = None
num_measurements = transf_log_probs.shape[0] * transf_log_probs.shape[1]
transf_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)
self.val_loss(loss=transf_loss, num_measurements=num_measurements)
output_dict = {f'{eval_mode}_loss': transf_loss}

self.wer.update(
predictions=enc_states,
@@ -983,6 +1003,8 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo
'text_field': config.get('text_field', 'answer'),
'lang_field': config.get('lang_field', 'target_lang'),
'channel_selector': config.get('channel_selector', None),
'pad_min_duration': config.get('pad_min_duration', 1.0),
'pad_direction': config.get('pad_direction', 'both'),
}

temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
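
The new `use_loss_mask_for_prompt` path in `training_step` and `validation_pass` builds the loss mask from two pieces: a padding mask over the BOS-shifted decoder targets (`input_ids_lens = prompted_transcript_lens - 1`) and the negation of a prompt mask (`prompt_lens - 1`, again accounting for the dropped BOS), so only answer tokens contribute to the loss and to `num_measurements`. Below is a minimal runnable sketch with made-up lengths that reuses the `lens_to_mask` logic from this diff; the example batch, sequence lengths, and printed mask are illustrative only.

```python
import torch


def lens_to_mask(lens: torch.Tensor, max_length: int) -> torch.Tensor:
    """Boolean mask of shape (batch, max_length): True where position < length."""
    batch_size = lens.shape[0]
    arange = torch.arange(max_length, device=lens.device)
    return arange.expand(batch_size, max_length) < lens.unsqueeze(1)


# Dummy batch of two sequences, laid out as [prompt tokens..., answer tokens..., padding...].
prompted_transcript_lens = torch.tensor([10, 7])  # prompt + answer length, without padding
prompt_lens = torch.tensor([4, 4])                # leading prompt tokens, including BOS
max_prompted_len = 12                             # padded length of the batch

# Decoder inputs drop the final token and targets drop BOS, hence the -1 on both lengths.
input_ids_lens = prompted_transcript_lens - 1
maxlen = max_prompted_len - 1

# Keep positions that are real tokens AND are not part of the prompt.
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(prompt_lens - 1, maxlen)
print(loss_mask.long())
# tensor([[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0],
#         [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0]])
```

In the validation path, the count of unmasked positions (`loss_mask.long().sum()`) replaces the previous `batch * time` product passed to `self.val_loss`, so the reported loss is averaged over answer tokens only.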
20 changes: 0 additions & 20 deletions nemo/collections/asr/modules/transformer/transformer_modules.py
@@ -58,27 +58,7 @@ def _build_pos_enc(self, hidden_size, max_sequence_length, device=None):
self.register_buffer('pos_enc', pos_enc)

def forward(self, position_ids):
max_pos_id = position_ids.max()
# update positional encoding if needed
if max_pos_id >= self._max_sequence_length:
logging.warning(
f'Max position id {max_pos_id} is greater than max sequence length {self._max_sequence_length}. Expanding position embeddings just for this batch. This is not expected to work very well. Consider chunking your input into smaller sequences.'
)
self._build_pos_enc(
hidden_size=self._hidden_size,
max_sequence_length=max_pos_id + 1,
device=position_ids.device,
)

embeddings = torch.embedding(self.pos_enc, position_ids)

# Revert expansion of position embeddings since this will cause checkpoint size mismatches.
if max_pos_id >= self._max_sequence_length:
self._build_pos_enc(
hidden_size=self._hidden_size,
max_sequence_length=self._max_sequence_length,
device=position_ids.device,
)
return embeddings


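
With the dynamic-expansion branch removed, `forward` reduces to a plain `torch.embedding` lookup into the fixed `pos_enc` buffer, so position ids beyond `max_sequence_length` now fail loudly instead of triggering a temporary, checkpoint-unfriendly rebuild of the table. A hedged sketch of that behaviour follows; the sinusoidal table here is a generic textbook construction, not necessarily identical to NeMo's `_build_pos_enc`.

```python
import math

import torch


def build_sinusoidal_pos_enc(max_len: int, hidden: int) -> torch.Tensor:
    """Generic sinusoidal table of shape (max_len, hidden); an assumption, not NeMo's exact code."""
    position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, hidden, 2, dtype=torch.float) * (-math.log(10000.0) / hidden))
    pos_enc = torch.zeros(max_len, hidden)
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    return pos_enc


pos_enc = build_sinusoidal_pos_enc(max_len=512, hidden=8)

in_range = torch.tensor([[0, 1, 2, 511]])
print(torch.embedding(pos_enc, in_range).shape)  # torch.Size([1, 4, 8])

out_of_range = torch.tensor([[0, 1, 512]])
try:
    torch.embedding(pos_enc, out_of_range)  # id 512 exceeds the fixed table
except (IndexError, RuntimeError) as e:
    print("out-of-range position id:", type(e).__name__)
```

Callers that previously relied on the silent expansion are expected to chunk long inputs instead, as the removed warning message already suggested.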
6 changes: 6 additions & 0 deletions nemo/collections/common/data/lhotse/dataloader.py
@@ -135,6 +135,9 @@ class LhotseDataLoadingConfig:
rir_enabled: bool = False
rir_path: str | None = None # str, must point to a lhotse RecordingSet manifest
rir_prob: float = 0.5
# f. Padding to a minimum duration. Examples shorter than this will be padded, others are unaffected.
pad_min_duration: Optional[float] = None
pad_direction: str = "right" # "right" | "left" | "both" | "random"

# 5. Other Lhotse options.
text_field: str = "text" # key to read the transcript from
@@ -278,6 +281,9 @@ def get_lhotse_dataloader_from_config(
keep_excessive_supervisions=config.keep_excessive_supervisions,
)

if config.pad_min_duration is not None:
cuts = cuts.pad(duration=config.pad_min_duration, direction=config.pad_direction, preserve_id=True)

# Duration filtering, same as native NeMo dataloaders.
# We can filter after the augmentations because they are applied only when calling load_audio().
cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration))
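
The two new fields on `LhotseDataLoadingConfig` let a config request padding of short examples before duration filtering: when `pad_min_duration` is set, `get_lhotse_dataloader_from_config` calls `cuts.pad(duration=config.pad_min_duration, direction=config.pad_direction, preserve_id=True)`. A small sketch of a config fragment exercising the new knobs, assembled with OmegaConf; the values shown are illustrative and this is not a complete dataloader config.

```python
from omegaconf import OmegaConf

# Hypothetical config fragment showing the new padding knobs; values are illustrative.
cfg = OmegaConf.create(
    {
        "pad_min_duration": 1.0,  # examples shorter than 1 s are padded, others untouched
        "pad_direction": "both",  # "right" | "left" | "both" | "random"
        "min_duration": 0.1,      # duration filtering still runs after the padding step
        "max_duration": 40.0,
    }
)
print(OmegaConf.to_yaml(cfg))
```

This pairs with the `_setup_transcribe_dataloader` change in aed_multitask_models.py above, which now forwards `pad_min_duration` (default 1.0) and `pad_direction` (default 'both') for transcription.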
1 change: 1 addition & 0 deletions nemo/collections/common/prompts/__init__.py
@@ -1,4 +1,5 @@
from nemo.collections.common.prompts.canary import CanaryPromptFormatter
from nemo.collections.common.prompts.canary2 import Canary2PromptFormatter
from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn
from nemo.collections.common.prompts.formatter import PromptFormatter
from nemo.collections.common.prompts.gemma import GemmaPromptFormatter
(Diffs for the remaining changed files are not shown here.)
