Merge branch 'main' into handle_retro_import_error

guyueh1 committed Dec 5, 2024
2 parents 16d85f2 + 1f492d3 · commit 5a94ed2
Showing 27 changed files with 1,445 additions and 211 deletions.
49 changes: 40 additions & 9 deletions .github/workflows/cicd-main.yml
@@ -2109,10 +2109,10 @@ jobs:
# }
# }

L2_Megatron_LM_To_NeMo_Conversion:
Optional_L2_Megatron_LM_To_NeMo_Conversion:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_LM_To_NeMo_Conversion') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -2223,6 +2223,7 @@ jobs:
--model_type gpt \
--hparams_file /tmp/nemo_conversion_ckpt/megatron_gpt/version_0/hparams.yaml \
--convert_mlm
IS_OPTIONAL: true

L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2:
needs: [cicd-test-container-setup]
@@ -3322,10 +3323,10 @@ jobs:
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
L2_Megatron_T5_Pretraining_and_Resume_Training_PP2:
Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -3390,6 +3391,7 @@ jobs:
AFTER_SCRIPT: |
rm -rf examples/nlp/language_modeling/t5_pretrain_results
rm -rf examples/nlp/language_modeling/t5_index_mappings
IS_OPTIONAL: true

L2_Megatron_T5_w_Mixture_of_Expert_Pretraining:
needs: [cicd-test-container-setup]
@@ -4401,6 +4403,33 @@ jobs:
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 3 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft canonical_lora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
python tests/collections/llm/gpt_finetuning.py \
--restore_path /home/TestData/nemo2_ckpt/llama_68M \
--devices 2 \
--max_steps 6 \
--experiment_dir /tmp/nemo2_gpt_finetune/${{ github.run_id }} \
--peft canonical_lora \
--tp_size 1 \
--pp_size 1 \
--mbs 1 --packed
L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
@@ -4430,10 +4459,10 @@ jobs:
--model mixtral \
--dist-opt
L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1:
Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1:
needs: [cicd-test-container-setup]
uses: ./.github/workflows/_test_template.yml
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1') || needs.cicd-test-container-setup.outputs.all == 'true'
with:
RUNNER: self-hosted-azure
SCRIPT: |
@@ -4444,6 +4473,7 @@
--mbs 1 \
--model mixtral \
--dist-opt
IS_OPTIONAL: true

L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1:
needs: [cicd-test-container-setup]
@@ -4587,7 +4617,7 @@ jobs:
- L2_RAG_Pipeline_Generating
- L2_Megatron_GPT_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_Skip_Train
- L2_Megatron_LM_To_NeMo_Conversion
# - Optional_L2_Megatron_LM_To_NeMo_Conversion
- L2_Megatron_GPT_with_Rope_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_ResetLR_Pretraining_and_Resume_Training_TP2
- L2_Megatron_GPT_with_Drop_Optimizer_States_TP2
@@ -4609,7 +4639,7 @@
- L2_Megatron_Core_T5_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_ALiBi_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_with_KERPLE_Pretraining_and_Resume_Training_TP2
- L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
# - Optional_L2_Megatron_T5_Pretraining_and_Resume_Training_PP2
- L2_Megatron_T5_w_Mixture_of_Expert_Pretraining
- L2_Megatron_UL2_Pretraining_and_Resume_Training_TP2
- L2_Megatron_Core_T5_Eval
@@ -4646,9 +4676,10 @@ jobs:
- L2_NeMo_2_GPT_LoRA_TP2PP1_MBS2
- L2_NeMo_2_GPT_LoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_GPT_DoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_GPT_CLoRA_TP1PP1_MBS1_PACKED
- L2_NeMo_2_Mixtral_LoRA_EP2PP1_MBS2
- L2_NeMo_2_Mixtral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
# - Optional_L2_NeMo_2_Mixtral_LoRA_TP2PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP1PP1_MBS1
- L2_NeMo_2_Mistral_LoRA_TP2PP1_MBS1
- L2_NEMO_2_LoRA_MERGE
5 changes: 0 additions & 5 deletions .github/workflows/secrets-detector.yml
@@ -25,11 +25,6 @@ jobs:
- name: Checkout repository
uses: actions/checkout@v4
with:
# setup repository and ref for PRs, see
# https://github.com/EndBug/add-and-commit?tab=readme-ov-file#working-with-prs
repository: ${{ github.event.pull_request.head.repo.full_name }}
ref: ${{ github.event.pull_request.head.ref }}
# custom token is required to trigger actions after reformatting + pushing
fetch-depth: 0
token: ${{ secrets.NEMO_REFORMAT_TOKEN }}

12 changes: 6 additions & 6 deletions Dockerfile.ci
@@ -54,15 +54,15 @@ RUN pip install nemo_run@git+https://github.com/NVIDIA/NeMo-Run.git@${NEMO_RUN_T
# Install NeMo requirements
ARG TE_TAG=7d576ed25266a17a7b651f2c12e8498f67e0baea
ARG MODELOPT_VERSION=0.19.0
ARG MCORE_TAG=c1728c12f1f1cdbb786e52f1ffe512295d76bef3
ARG MCORE_TAG=2f67f35b4a9e5596034e93eddeed07009bcf630c

ARG APEX_TAG=810ffae374a2b9cb4b5c5e28eaeca7d7998fca0c
RUN \
--mount=type=bind,source=requirements,target=requirements \
--mount=type=bind,source=tools,target=tools \
--mount=type=bind,source=setup.py,target=setup.py \
--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
--mount=type=bind,source=requirements,target=requirements \
--mount=type=bind,source=tools,target=tools \
--mount=type=bind,source=setup.py,target=setup.py \
--mount=type=bind,source=nemo/package_info.py,target=nemo/package_info.py \
--mount=type=bind,source=nemo/__init__.py,target=nemo/__init__.py <<"EOF" bash -ex
pip install --no-cache-dir --no-build-isolation --extra-index-url https://pypi.nvidia.com \
"transformer-engine @ git+https://github.com/NVIDIA/TransformerEngine.git@${TE_TAG}" \
"megatron_core @ git+https://github.com/NVIDIA/Megatron-LM.git@${MCORE_TAG}" \
3 changes: 2 additions & 1 deletion examples/asr/conf/speech_multitask/fast-conformer_aed.yaml
@@ -24,11 +24,12 @@ spl_tokens:
model:
sample_rate: 16000
label_smoothing: 0.0
context_len_for_AR_decoding: 5 # Length of input prompt tokens. For example, in Canary models, we use [BOS,src_lang,task,tgt_lang,pnc] and thus the length is 5
use_loss_mask_for_prompt: false
log_prediction: true # enables logging sample predictions in the output during training

# Important ! Set the prompt format to the class you need
prompt_format: ??? # Options supported: ["canary"]
prompt_defaults: null

model_defaults:
asr_enc_hidden: 1024
24 changes: 23 additions & 1 deletion nemo/collections/asr/data/audio_to_text_lhotse_prompted.py
@@ -16,6 +16,7 @@

import torch.utils.data
from lhotse import CutSet
from lhotse.cut import MixedCut
from lhotse.dataset import AudioSamples
from lhotse.dataset.collation import collate_vectors

@@ -99,7 +100,7 @@ def __getitem__(self, cuts: CutSet) -> PromptedAudioToTextMiniBatch:
prompt_lens=prompt_lens,
prompted_transcript=prompts_with_answers,
prompted_transcript_lens=prompts_with_answers_lens,
cuts=cuts.drop_in_memory_data(),
cuts=_drop_in_memory_data(cuts),
)

def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple[torch.Tensor, torch.Tensor]:
@@ -111,3 +112,24 @@ def _collate_tokens(self, tokens: list[Union[list[int], torch.Tensor]]) -> tuple

class ProbablyIncorrectLanguageKeyError(RuntimeError):
pass


def _drop_in_memory_data(
cuts: CutSet,
_fields=frozenset(MixedCut.__dataclass_fields__.keys()),
) -> CutSet:
"""Workaround for an edge case in cuts.drop_in_memory_data() on MixedCut with Lhotse<1.29.0"""
ans = []
for c in cuts:
# Not a mixed cut or a mixed cut that wasn't assigned any extra attributes.
if not isinstance(c, MixedCut) or _fields.issuperset(c.__dict__.keys()):
ans.append(c.drop_in_memory_data())
else:
extra_attrs = {k: v for k, v in c.__dict__.items() if k not in _fields}
for k in extra_attrs:
delattr(c, k)
ans.append(c.drop_in_memory_data())
for k, v in extra_attrs.items():
setattr(ans[-1], k, v)
setattr(c, k, v)
return CutSet(ans)
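
The `_drop_in_memory_data` helper above works around a Lhotse (<1.29.0) edge case where calling `drop_in_memory_data()` on a `MixedCut` rebuilds the cut from its dataclass fields and silently loses any extra attributes attached to the instance; the workaround detaches those attributes, drops the in-memory audio, then reattaches them to both the new and the original cut. The following self-contained sketch illustrates the same detach/reattach pattern on a stand-in dataclass rather than the real Lhotse `MixedCut`; `FakeCut` and `drop_in_memory_data_keeping_extras` are hypothetical names used only for illustration.

```python
from dataclasses import dataclass, fields


@dataclass
class FakeCut:
    """Stand-in for a Lhotse MixedCut: a dataclass that may carry ad-hoc extra attributes."""

    id: str
    audio: bytes | None = None

    def drop_in_memory_data(self) -> "FakeCut":
        # Mimics the pre-1.29.0 behaviour: the cut is rebuilt from its declared
        # dataclass fields only, so extra instance attributes would be lost.
        return FakeCut(id=self.id, audio=None)


def drop_in_memory_data_keeping_extras(cut: FakeCut) -> FakeCut:
    """Same detach/reattach pattern as _drop_in_memory_data above (hypothetical helper)."""
    declared = {f.name for f in fields(cut)}
    extra = {k: v for k, v in cut.__dict__.items() if k not in declared}
    for k in extra:
        delattr(cut, k)
    new_cut = cut.drop_in_memory_data()
    for k, v in extra.items():
        setattr(new_cut, k, v)  # re-attach to the rebuilt cut
        setattr(cut, k, v)  # restore the original cut as well
    return new_cut


if __name__ == "__main__":
    c = FakeCut(id="utt-1", audio=b"\x00" * 16000)
    c.custom_lang = "en"  # extra attribute a dataset might attach
    out = drop_in_memory_data_keeping_extras(c)
    assert out.audio is None and out.custom_lang == "en" and c.custom_lang == "en"
```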
50 changes: 36 additions & 14 deletions nemo/collections/asr/models/aed_multitask_models.py
@@ -70,7 +70,8 @@ def lens_to_mask(lens, max_length):
Create a mask from a tensor of lengths.
"""
batch_size = lens.shape[0]
mask = torch.arange(max_length).repeat(batch_size, 1).to(lens.device) < lens[:, None]
arange = torch.arange(max_length, device=lens.device)
mask = arange.expand(batch_size, max_length) < lens.unsqueeze(1)
return mask


@@ -697,24 +698,34 @@ def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb):
return torch.tensor([0.0])

input_ids, labels = batch.get_decoder_inputs_outputs()
input_ids_lens = batch.prompted_transcript_lens - 1

num_frames = batch.audio_lens.sum().float()
num_tokens = batch.prompted_transcript_lens.sum().float()
tot_frames = torch.as_tensor(batch.audio.numel(), device=num_frames.device, dtype=torch.float)
tot_tokens = torch.as_tensor(batch.prompted_transcript.numel(), device=num_frames.device, dtype=torch.float)

transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
input_signal=batch.audio,
input_signal_length=batch.audio_lens,
transcript=input_ids,
transcript_length=batch.prompted_transcript_lens,
transcript_length=input_ids_lens,
)

audio_loss = self.loss(log_probs=transf_log_probs, labels=labels)
# Mask components: 1) discard padding & 2) discard prompt (notice the negation)
# For a full decoder sequence O with len M, the loss mask skips the first element,
# covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
if self.cfg.get("use_loss_mask_for_prompt", False):
maxlen = batch.prompted_transcript.shape[1] - 1
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
else:
loss_mask = None
audio_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)

num_frames = batch.audio_lens.sum()
num_tokens = batch.prompted_transcript_lens.sum()
tot_frames = batch.audio.numel()
tot_tokens = batch.prompted_transcript.numel()
tensorboard_logs = {
'train_loss': audio_loss,
'learning_rate': self._optimizer.param_groups[0]['lr'],
'batch_size': batch.audio.shape[0],
'learning_rate': torch.as_tensor(self._optimizer.param_groups[0]['lr']),
'batch_size': torch.as_tensor(batch.audio.shape[0]),
'num_frames': num_frames,
'num_tokens': num_tokens,
'input_to_padding_ratio': num_frames / tot_frames,
@@ -725,6 +736,7 @@ def training_step(self, batch: PromptedAudioToTextMiniBatch, batch_nb):

def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, dataloader_idx=0, eval_mode="val"):
input_ids, labels = batch.get_decoder_inputs_outputs()
input_ids_lens = batch.prompted_transcript_lens - 1

transf_log_probs, encoded_len, enc_states, enc_mask = self.forward(
input_signal=batch.audio,
@@ -733,11 +745,19 @@ def validation_pass(self, batch: PromptedAudioToTextMiniBatch, batch_idx, datalo
transcript_length=batch.prompted_transcript_lens,
)

transf_loss = self.loss(log_probs=transf_log_probs, labels=labels)
self.val_loss(loss=transf_loss, num_measurements=transf_log_probs.shape[0] * transf_log_probs.shape[1])
output_dict = {
f'{eval_mode}_loss': transf_loss,
}
# Mask components: 1) discard padding & 2) discard prompt (notice the negation)
# For a full decoder sequence O with len M, the loss mask skips the first element,
# covering the remaining M-1 elements - hence we subtract 1 from prompt lens to account for BOS.
if self.cfg.get("use_loss_mask_for_prompt", False):
maxlen = batch.prompted_transcript.shape[1] - 1
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(batch.prompt_lens - 1, maxlen)
num_measurements = loss_mask.long().sum()
else:
loss_mask = None
num_measurements = transf_log_probs.shape[0] * transf_log_probs.shape[1]
transf_loss = self.loss(log_probs=transf_log_probs, labels=labels, output_mask=loss_mask)
self.val_loss(loss=transf_loss, num_measurements=num_measurements)
output_dict = {f'{eval_mode}_loss': transf_loss}

self.wer.update(
predictions=enc_states,
@@ -983,6 +1003,8 @@ def _setup_transcribe_dataloader(self, config: Dict) -> 'torch.utils.data.DataLo
'text_field': config.get('text_field', 'answer'),
'lang_field': config.get('lang_field', 'target_lang'),
'channel_selector': config.get('channel_selector', None),
'pad_min_duration': config.get('pad_min_duration', 1.0),
'pad_direction': config.get('pad_direction', 'both'),
}

temporary_datalayer = self._setup_dataloader_from_config(config=DictConfig(dl_config))
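
The new `use_loss_mask_for_prompt` path in `training_step` and `validation_pass` builds the loss mask from two pieces: a padding mask over the BOS-shifted decoder targets (`input_ids_lens = prompted_transcript_lens - 1`) and the negation of a prompt mask (`prompt_lens - 1`, again accounting for the dropped BOS), so only answer tokens contribute to the loss and to `num_measurements`. Below is a minimal runnable sketch with made-up lengths that reuses the `lens_to_mask` logic from this diff; the example batch, sequence lengths, and printed mask are illustrative only.

```python
import torch


def lens_to_mask(lens: torch.Tensor, max_length: int) -> torch.Tensor:
    """Boolean mask of shape (batch, max_length): True where position < length."""
    batch_size = lens.shape[0]
    arange = torch.arange(max_length, device=lens.device)
    return arange.expand(batch_size, max_length) < lens.unsqueeze(1)


# Dummy batch of two sequences, laid out as [prompt tokens..., answer tokens..., padding...].
prompted_transcript_lens = torch.tensor([10, 7])  # prompt + answer length, without padding
prompt_lens = torch.tensor([4, 4])                # leading prompt tokens, including BOS
max_prompted_len = 12                             # padded length of the batch

# Decoder inputs drop the final token and targets drop BOS, hence the -1 on both lengths.
input_ids_lens = prompted_transcript_lens - 1
maxlen = max_prompted_len - 1

# Keep positions that are real tokens AND are not part of the prompt.
loss_mask = lens_to_mask(input_ids_lens, maxlen) & ~lens_to_mask(prompt_lens - 1, maxlen)
print(loss_mask.long())
# tensor([[0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0],
#         [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0]])
```

In the validation path, the count of unmasked positions (`loss_mask.long().sum()`) replaces the previous `batch * time` product passed to `self.val_loss`, so the reported loss is averaged over answer tokens only.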
20 changes: 0 additions & 20 deletions nemo/collections/asr/modules/transformer/transformer_modules.py
@@ -58,27 +58,7 @@ def _build_pos_enc(self, hidden_size, max_sequence_length, device=None):
self.register_buffer('pos_enc', pos_enc)

def forward(self, position_ids):
max_pos_id = position_ids.max()
# update positional encoding if needed
if max_pos_id >= self._max_sequence_length:
logging.warning(
f'Max position id {max_pos_id} is greater than max sequence length {self._max_sequence_length}. Expanding position embeddings just for this batch. This is not expected to work very well. Consider chunking your input into smaller sequences.'
)
self._build_pos_enc(
hidden_size=self._hidden_size,
max_sequence_length=max_pos_id + 1,
device=position_ids.device,
)

embeddings = torch.embedding(self.pos_enc, position_ids)

# Revert expansion of position embeddings since this will cause checkpoint size mismatches.
if max_pos_id >= self._max_sequence_length:
self._build_pos_enc(
hidden_size=self._hidden_size,
max_sequence_length=self._max_sequence_length,
device=position_ids.device,
)
return embeddings


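
With the dynamic-expansion branch removed, `forward` reduces to a plain `torch.embedding` lookup into the fixed `pos_enc` buffer, so position ids beyond `max_sequence_length` now fail loudly instead of triggering a temporary, checkpoint-unfriendly rebuild of the table. A hedged sketch of that behaviour follows; the sinusoidal table here is a generic textbook construction, not necessarily identical to NeMo's `_build_pos_enc`.

```python
import math

import torch


def build_sinusoidal_pos_enc(max_len: int, hidden: int) -> torch.Tensor:
    """Generic sinusoidal table of shape (max_len, hidden); an assumption, not NeMo's exact code."""
    position = torch.arange(max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, hidden, 2, dtype=torch.float) * (-math.log(10000.0) / hidden))
    pos_enc = torch.zeros(max_len, hidden)
    pos_enc[:, 0::2] = torch.sin(position * div_term)
    pos_enc[:, 1::2] = torch.cos(position * div_term)
    return pos_enc


pos_enc = build_sinusoidal_pos_enc(max_len=512, hidden=8)

in_range = torch.tensor([[0, 1, 2, 511]])
print(torch.embedding(pos_enc, in_range).shape)  # torch.Size([1, 4, 8])

out_of_range = torch.tensor([[0, 1, 512]])
try:
    torch.embedding(pos_enc, out_of_range)  # id 512 exceeds the fixed table
except (IndexError, RuntimeError) as e:
    print("out-of-range position id:", type(e).__name__)
```

Callers that previously relied on the silent expansion are expected to chunk long inputs instead, as the removed warning message already suggested.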
6 changes: 6 additions & 0 deletions nemo/collections/common/data/lhotse/dataloader.py
@@ -135,6 +135,9 @@ class LhotseDataLoadingConfig:
rir_enabled: bool = False
rir_path: str | None = None # str, must point to a lhotse RecordingSet manifest
rir_prob: float = 0.5
# f. Padding to a minimum duration. Examples shorter than this will be padded, others are unaffected.
pad_min_duration: Optional[float] = None
pad_direction: str = "right" # "right" | "left" | "both" | "random"

# 5. Other Lhotse options.
text_field: str = "text" # key to read the transcript from
@@ -278,6 +281,9 @@ def get_lhotse_dataloader_from_config(
keep_excessive_supervisions=config.keep_excessive_supervisions,
)

if config.pad_min_duration is not None:
cuts = cuts.pad(duration=config.pad_min_duration, direction=config.pad_direction, preserve_id=True)

# Duration filtering, same as native NeMo dataloaders.
# We can filter after the augmentations because they are applied only when calling load_audio().
cuts = cuts.filter(DurationFilter(config.min_duration, config.max_duration))
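
The two new fields on `LhotseDataLoadingConfig` let a config request padding of short examples before duration filtering: when `pad_min_duration` is set, `get_lhotse_dataloader_from_config` calls `cuts.pad(duration=config.pad_min_duration, direction=config.pad_direction, preserve_id=True)`. A small sketch of a config fragment exercising the new knobs, assembled with OmegaConf; the values shown are illustrative and this is not a complete dataloader config.

```python
from omegaconf import OmegaConf

# Hypothetical config fragment showing the new padding knobs; values are illustrative.
cfg = OmegaConf.create(
    {
        "pad_min_duration": 1.0,  # examples shorter than 1 s are padded, others untouched
        "pad_direction": "both",  # "right" | "left" | "both" | "random"
        "min_duration": 0.1,      # duration filtering still runs after the padding step
        "max_duration": 40.0,
    }
)
print(OmegaConf.to_yaml(cfg))
```

This pairs with the `_setup_transcribe_dataloader` change in aed_multitask_models.py above, which now forwards `pad_min_duration` (default 1.0) and `pad_direction` (default 'both') for transcription.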
1 change: 1 addition & 0 deletions nemo/collections/common/prompts/__init__.py
@@ -1,4 +1,5 @@
from nemo.collections.common.prompts.canary import CanaryPromptFormatter
from nemo.collections.common.prompts.canary2 import Canary2PromptFormatter
from nemo.collections.common.prompts.fn import get_prompt_format_fn, registered_prompt_format_fn
from nemo.collections.common.prompts.formatter import PromptFormatter
from nemo.collections.common.prompts.gemma import GemmaPromptFormatter
(Diffs for the remaining changed files are not shown here.)
