| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
| import omegaconf |
|
|
| from .pretrained_vae import JITVAE, JointImageVideoSharedJITTokenizer, VideoJITTokenizer |
| from .lazy_config_init import LazyCall as L |
|
|
# Registry mapping a tokenizer key to the factory function registered under it.
TOKENIZER_OPTIONS = {}


def tokenizer_register(key):
    """Return a decorator that records a function in ``TOKENIZER_OPTIONS``.

    Args:
        key: Registry key under which the decorated function is stored.
            Registering the same key again replaces the previous entry.

    Returns:
        A decorator that stores the function under ``key`` and hands the
        same function object back unchanged.
    """

    def decorator(func):
        TOKENIZER_OPTIONS[key] = func
        return func

    return decorator
|
|
|
|
@tokenizer_register("cosmos_diffusion_tokenizer_comp8x8x8")
def get_cosmos_diffusion_tokenizer_comp8x8x8(resolution: str, chunk_duration: int) -> omegaconf.dictconfig.DictConfig:
    """Build the lazy config for the joint image/video tokenizer.

    The video tokenizer is configured with a temporal compression factor of 8
    and a spatial compression factor of 8. The function is registered in
    ``TOKENIZER_OPTIONS`` under ``"cosmos_diffusion_tokenizer_comp8x8x8"``.

    Args:
        resolution: Spatial resolution label; only ``"720"`` is accepted. It is
            forwarded to the video tokenizer as ``spatial_resolution``.
        chunk_duration: Forwarded to the video tokenizer as
            ``pixel_chunk_duration``.

    Returns:
        The result of ``L(JointImageVideoSharedJITTokenizer)(...)`` wrapping a
        video tokenizer config and an image VAE config.

    Raises:
        AssertionError: If ``resolution`` is not ``"720"`` (only when
            assertions are enabled).
    """
    assert resolution in ["720"], f"Unsupported resolution {resolution!r}; expected one of ['720']"

    pixel_chunk_duration = chunk_duration
    temporal_compression_factor = 8
    spatial_compression_factor = 8

    return L(JointImageVideoSharedJITTokenizer)(
        video_vae=L(VideoJITTokenizer)(
            name="cosmos_1_0_diffusion_tokenizer",
            latent_ch=16,
            is_bf16=True,
            pixel_chunk_duration=pixel_chunk_duration,
            temporal_compression_factor=temporal_compression_factor,
            spatial_compression_factor=spatial_compression_factor,
            spatial_resolution=resolution,
        ),
        # NOTE(review): is_image=False is passed to the image VAE — confirm
        # this is intended.
        image_vae=L(JITVAE)(
            name="cosmos_1_0_diffusion_tokenizer",
            latent_ch=16,
            is_image=False,
            is_bf16=True,
        ),
        name="cosmos_1_0_diffusion_tokenizer",
        latent_ch=16,
    )
|
|