| name: comma_v0p1_yolooooo |
| dump_dir: /fsx/craffel/lingua_logs/comma_v0p1/ |
| seed: 777 |
| grad_acc_steps: 4 |
| gc_collect_freq: 1000 |
| probe_freq: null |
| steps: 500000 |
| data: |
| root_dir: /scratch/craffel/lingua/data/ |
| sources: |
| peS2o: 0.274065475510351 |
| stackexchange: 0.134617935796937 |
| stackv2_edu: 0.127770669195666 |
| cccc: 0.0871992270000557 |
| wikimedia: 0.0861800315862719 |
| github_archive: 0.0606452345122248 |
| uspto: 0.0413469377516883 |
| pubmed: 0.0367902799837971 |
| arxiv_papers: 0.0292395449667613 |
| caselaw_access_project: 0.0193875362722656 |
| wikiteam: 0.0137485410839637 |
| doab: 0.0180439781895451 |
| uk_hansard: 0.0144498535570883 |
| pre_1929_books: 0.0115755547988338 |
| ubuntu_irc: 0.00794254267719456 |
| regulations: 0.00762583706405442 |
| data_provenance_initiative: 0.00512264496834867 |
| project_gutenberg: 0.00502100654070129 |
| youtube: 0.00465917165839394 |
| arxiv_abstracts: 0.00359635066160403 |
| stackv2_html: 0.00225924255952781 |
| usgpo: 0.00226024581728848 |
| library_of_congress: 0.00222469340783564 |
| biodiversity_heritage_library: 0.00221737524370278 |
| pressbooks: 0.000865101033213598 |
| libretexts: 0.00054149556727006 |
| news: 0.000372716196818104 |
| foodista: 0.000125363443065615 |
| oercommons: 7.78696843693821e-05 |
| python_enhancement_proposals: 1.69983991984805e-05 |
| public_domain_review: 1.05448719635173e-05 |
| batch_size: 2 |
| seq_len: 4096 |
| n_views: 2 |
| seed: 42 |
| add_bos: true |
| add_eos: true |
| load_async: true |
| prefetch_size: 4096 |
| tokenizer: |
| name: tiktoken |
| path: /fsx/craffel/lingua/tokenizers/common-pile-tokenizer.tiktoken |
| optim: |
| lr: 0.001 |
| weight_decay: 0.2 |
| epsilon: 1.0e-08 |
| beta1: 0.9 |
| beta2: 0.95 |
| clip: 1.0 |
| scheduler: cosine |
| warmup: 2000 |
| lr_min_ratio: 1.0e-06 |
| cycle_length: 1.0 |
| cosine_theta: 1.0 |
| annealing_step: 1000 |
| decay_fraction: 0.1 |
| exp_factor: 0.5 |
| model: |
| dim: 4096 |
| n_layers: 32 |
| head_dim: null |
| n_heads: 32 |
| n_kv_heads: null |
| ffn_dim_multiplier: 1.0 |
| multiple_of: 256 |
| norm_eps: 1.0e-05 |
| rope_theta: 100000.0 |
| init_base_std: null |
| init_std_factor: disabled |
| max_seqlen: 4096 |
| seed: 42 |
| vocab_size: 64256 |
| weight_tying: false |
| sliding_window: null |
| distributed: |
| dp_shard: 1 |
| dp_replicate: 64 |
| tp_size: 1 |
| selective_activation_checkpointing: false |
| compile: true |
| fsdp_type: full_shard |
| model_dtype: bf16 |
| float8_recipe: null |
| float8_filter: layers\.[0-9]+\. |
| matmul_allow_tf32: false |
| detect_anomaly: false |
| compile_cache_size_limit: 8 |
| spawn_method: forkserver |
| env: |
| MKL_SERVICE_FORCE_INTEL: GNU |
| OMP_NUM_THREADS: '1' |
| MKL_NUM_THREADS: '1' |
| ENABLE_INTRA_NODE_COMM: '1' |
| TORCH_NCCL_AVOID_RECORD_STREAMS: '1' |
| NCCL_IB_TIMEOUT: '22' |
| NCCL_DEBUG: INFO |
| TORCH_NCCL_ASYNC_ERROR_HANDLING: '1' |
| checkpoint: |
| dump: |
| every: 10000 |
| keep: -1 |
| eval: |
| every: 2000 |
| keep: 3 |
| path: /fsx/craffel/lingua_logs/comma_v0p1/checkpoints |
| init_ckpt_path: null |
| continue_training_from_init: false |
| profiling: |
| run: true |
| trace_folder: profiling |
| mem_warmup: 0 |
| mem_steps: 4 |
| profile_warmup: 100 |
| profile_steps: 4 |
| logging: |
| freq: 1 |
| acc_freq: null |
| wandb: null |
| async_eval_gpus: 8 |
| eval: |
| harness: |
| tasks: |
| - hellaswag |
| - task: boolq |
| dataset_kwargs: |
| trust_remote_code: true |
| - piqa |
| - task: social_iqa |
| dataset_kwargs: |
| trust_remote_code: true |
| - winogrande |
| - openbookqa |
| - arc_easy |
| - arc_challenge |
| - race |
| - commonsense_qa |
| - task: copa |
| dataset_kwargs: |
| trust_remote_code: true |
| - mmlu |
| - mmlu_pro |
| generator: |
| max_tokens: 8192 |
| dtype: bf16 |
|
|