Our repository is built on top of Galore, MARS and AlphaDecay. You can set up the environment with the following commands:
# Create and activate the conda environment.
conda create -n htmuon python=3.9 -y
conda activate htmuon
# NOTE(review): `conda install -r` is not a valid conda invocation; requirements
# files are installed with pip (or `conda install --file requirements.txt`).
pip install -r requirements.txt

We utilized the publicly available C4 dataset, OpenWebText dataset, CIFAR-10, CIFAR-100 and ImageNet-1K datasets, all of which can be accessed and downloaded from their respective official websites.
# Pre-train LLaMA-60M on 2 GPUs.
export WANDB_API_KEY='Your_WandB_API_Key_Here'
export CUDA_VISIBLE_DEVICES=0,1
# For acceleration use "htmuon_ns" / "htmuon_interval" (with --interval 5) as the optimizer.
# NOTE: comments must not follow a trailing `\` — the backslash would escape the
# space instead of the newline and break the line continuation.
torchrun --nproc_per_node=2 --master_port=20119 --master_addr=localhost torchrun_main_HTMuon.py \
    --model_config configs/llama_60m.json \
    --optimizer htmuon \
    --seed 5 \
    --lr 0.001 \
    --lrmuon 0.03 \
    --power 0.125 \
    --batch_size 256 \
    --total_batch_size 512 \
    --num_training_steps 10000 \
    --warmup_steps 1000 \
    --weight_decay 0.1 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --wandb_name 'Your_WandB_Name_Here' \
    --target_eval_tokens 10_000_000 \
    --save_every 10000

export WANDB_API_KEY='Your_WandB_API_Key_Here'
# Pre-train LLaMA-135M on 4 GPUs.
export CUDA_VISIBLE_DEVICES=0,1,2,3
# For acceleration use "htmuon_ns" / "htmuon_interval" (with --interval 5) as the optimizer.
torchrun --nproc_per_node=4 --master_port=20119 --master_addr=localhost torchrun_main_HTMuon.py \
    --model_config configs/llama_135m.json \
    --optimizer htmuon \
    --seed 5 \
    --lr 0.001 \
    --lrmuon 5e-3 \
    --batch_size 128 \
    --total_batch_size 512 \
    --num_training_steps 20000 \
    --warmup_steps 2000 \
    --weight_decay 0.1 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --wandb_name 'Your_WandB_Name_Here' \
    --target_eval_tokens 10_000_000 \
    --save_every 10000

export WANDB_API_KEY='Your_WandB_API_Key_Here'
# Pre-train LLaMA-350M on 4 GPUs.
export CUDA_VISIBLE_DEVICES=0,1,2,3
torchrun --nproc_per_node=4 --master_port=20119 --master_addr=localhost torchrun_main_HTMuon.py \
    --model_config configs/llama_350m.json \
    --optimizer htmuon \
    --seed 5 \
    --lr 0.001 \
    --lrmuon 5e-3 \
    --power 0.125 \
    --batch_size 128 \
    --total_batch_size 512 \
    --num_training_steps 60000 \
    --warmup_steps 6000 \
    --weight_decay 0.1 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --wandb_name 'Your_WandB_Name_Here' \
    --target_eval_tokens 10_000_000 \
    --save_every 10000

export WANDB_API_KEY='Your_WandB_API_Key_Here'
# Pre-train LLaMA-1B on 4 GPUs, using the interval variant of the optimizer
# (Newton-Schulz step applied every --interval iterations).
export CUDA_VISIBLE_DEVICES=0,1,2,3
torchrun --nproc_per_node=4 --master_port=20119 --master_addr=localhost torchrun_main_HTMuon.py \
    --model_config configs/llama_1b.json \
    --optimizer htmuon_interval \
    --seed 5 \
    --lr 0.001 \
    --lrmuon 5e-3 \
    --power 0.125 \
    --interval 5 \
    --batch_size 128 \
    --total_batch_size 512 \
    --num_training_steps 90000 \
    --warmup_steps 9000 \
    --weight_decay 0.1 \
    --dtype bfloat16 \
    --eval_every 1000 \
    --wandb_name 'Your_WandB_Name_Here' \
    --target_eval_tokens 10_000_000 \
    --save_every 10000

This repository is built upon the Galore, MARS and AlphaDecay repositories. Thanks for their great work!