tsqn committed on
Commit
ec8afe0
·
1 Parent(s): 3ef2a93

update README.md

Browse files
Files changed (1) hide show
  1. README.md +260 -79
README.md CHANGED
@@ -61,90 +61,271 @@ vae/
61
  `pip install git+https://github.com/huggingface/diffusers`
62
 
63
 
64
- ```python
65
- import torch
66
- from diffusers import ZImagePipeline
67
-
68
- # 1. Load the pipeline
69
- # Use bfloat16 for optimal performance on supported GPUs
70
- pipe = ZImagePipeline.from_pretrained(
71
- "path/to/model_files_main_dir",
72
- torch_dtype=torch.float32, # or torch.bfloat16 / torch.float16
73
- low_cpu_mem_usage=False,
74
- )
75
- pipe.to("cuda")
76
-
77
- # [Optional] Attention Backend
78
- # Diffusers uses SDPA by default. Switch to Flash Attention for better efficiency if supported:
79
- # pipe.transformer.set_attention_backend("flash") # Enable Flash-Attention-2
80
- # pipe.transformer.set_attention_backend("_flash_3") # Enable Flash-Attention-3
81
-
82
- # [Optional] Model Compilation
83
- # Compiling the DiT model accelerates inference, but the first run will take longer to compile.
84
- # pipe.transformer.compile()
85
-
86
- # [Optional] CPU Offloading
87
- # Enable CPU offloading for memory-constrained devices.
88
- # pipe.enable_model_cpu_offload()
89
-
90
- prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
91
-
92
- # 2. Generate Image
93
- image = pipe(
94
- prompt=prompt,
95
- height=1024,
96
- width=1024,
97
- num_inference_steps=9, # This actually results in 8 DiT forwards
98
- guidance_scale=0.0, # Guidance should be 0 for the Turbo models
99
- generator=torch.Generator("cuda").manual_seed(42),
100
- ).images[0]
101
-
102
- image.save("example.png")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
 
 
 
104
  ```
105
 
106
  #### Example 2
107
 
108
- ```py
109
- import torch
110
- from diffusers import ZImagePipeline, ZImageTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
111
- from transformers import Qwen3Model, Qwen2Tokenizer
112
-
113
-
114
- MODEL_PATH = "tsqn/Z-Image-Turbo_fp32-fp16-bf16_full_and_ema-only"
115
-
116
- vae = AutoencoderKL.from_pretrained(MODEL_PATH, subfolder="vae", torch_dtype=torch.bfloat16)
117
- text_encoder = Qwen3Model.from_pretrained(MODEL_PATH, subfolder="text_encoder", torch_dtype=torch.bfloat16)
118
- tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer")
119
- transformer = ZImageTransformer2DModel.from_pretrained(MODEL_PATH, subfolder="transformer", torch_dtype=torch.float32)
120
-
121
- pipe = ZImagePipeline.from_pretrained(
122
- MODEL_PATH,
123
- vae=vae,
124
- text_encoder=text_encoder,
125
- tokenizer=tokenizer,
126
- transformer=transformer,
127
- torch_dtype=torch.float32,
128
- low_cpu_mem_usage=False,
129
- )
130
- pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config)
131
- pipe.enable_model_cpu_offload()
132
-
133
- prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
134
-
135
- with torch.inference_mode():
136
- image = pipe(
137
- prompt=prompt,
138
- height=1024,
139
- width=1024,
140
- num_inference_steps=9,
141
- guidance_scale=0.0,
142
- generator=torch.Generator("cuda").manual_seed(42),
143
- ).images[0]
144
-
145
- image.save("example.png")
146
- torch.cuda.empty_cache()
147
- ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
 
149
 
150
  ## 🎯 Recommendations
 
61
  `pip install git+https://github.com/huggingface/diffusers`
62
 
63
 
64
+ #### Example with repo and model downloader
65
+
66
+ <details>
67
+ <summary>download_repo.py</summary>
68
+
69
+ ```py
70
+ import argparse
71
+ import os
72
+ from huggingface_hub import snapshot_download, hf_hub_download
73
+
74
+
75
+ REPO_ID = "tsqn/Z-Image-Turbo_fp32-fp16-bf16_full_and_ema-only"
76
+
77
+ def main(local_dir):
78
+ TRANSFORMER_DIR = f"{local_dir}\\transformer"
79
+ TEXT_ENCODER_DIR = f"{local_dir}\\text_encoder"
80
+ VAE_DIR = f"{local_dir}\\vae"
81
+
82
+ def _download_model_files():
83
+ snapshot_download(repo_id=REPO_ID, ignore_patterns="*.safetensors", local_dir=local_dir)
84
+ hf_hub_download(repo_id=REPO_ID, filename="diffusion_pytorch_model-ema-only-fp32.safetensors", local_dir=local_dir)
85
+ hf_hub_download(repo_id=REPO_ID, subfolder="text_encoder", filename="qwen_3_4b_bf16.safetensors", local_dir=local_dir)
86
+ hf_hub_download(repo_id=REPO_ID, subfolder="vae", filename="ae_bf16.safetensors", local_dir=local_dir)
87
+
88
+ def _rename_model_files():
89
+ os.rename(f"{TRANSFORMER_DIR}\\diffusion_pytorch_model-ema-only-fp32.safetensors", f"{TRANSFORMER_DIR}\\diffusion_pytorch_model.safetensors")
90
+ os.rename(f"{TEXT_ENCODER_DIR}\\qwen_3_4b_bf16.safetensors", f"{TEXT_ENCODER_DIR}\\model.safetensors")
91
+ os.rename(f"{VAE_DIR}\\ae_bf16.safetensors", f"{VAE_DIR}\\diffusion_pytorch_model.safetensors")
92
+
93
+ try:
94
+ _download_model_files()
95
+ except:
96
+ print("ERROR when downloading model files!")
97
+ raise Exception
98
+
99
+ try:
100
+ _rename_model_files()
101
+ except:
102
+ print("ERROR when renaming model files!")
103
+ raise Exception
104
+
105
+ if __name__ == "__main__":
106
+ parser = argparse.ArgumentParser()
107
+
108
+ parser.add_argument(
109
+ "--local_dir", default=None, type=str, required=True, help="Where to save repository with model files"
110
+ )
111
+
112
+ args = parser.parse_args()
113
+ main(args.local_dir)
114
+
115
+ ```
116
+
117
+ </details>
118
+
119
+
120
+ <details>
121
+ <summary>generate_image.py</summary>
122
+
123
+ ```py
124
+ import argparse
125
+ import torch
126
+ from diffusers import ZImagePipeline, ZImageTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
127
+ from transformers import Qwen3Model, Qwen2Tokenizer
128
+
129
+
130
+ def setup_zimage_pipeline(model_path, model_cpu_offload=True):
131
+ def _setup_pipeline_components():
132
+ vae = AutoencoderKL.from_pretrained(model_path, subfolder="vae", torch_dtype=torch.bfloat16)
133
+ text_encoder = Qwen3Model.from_pretrained(model_path, subfolder="text_encoder", torch_dtype=torch.bfloat16)
134
+ tokenizer = Qwen2Tokenizer.from_pretrained(model_path, subfolder="tokenizer")
135
+ transformer = ZImageTransformer2DModel.from_pretrained(model_path, subfolder="transformer", torch_dtype=torch.float32)
136
+ return {
137
+ "vae": vae,
138
+ "text_encoder": text_encoder,
139
+ "tokenizer": tokenizer,
140
+ "transformer": transformer
141
+ }
142
+ pipeline = ZImagePipeline.from_pretrained(
143
+ model_path,
144
+ torch_dtype=torch.float32,
145
+ low_cpu_mem_usage=False,
146
+ **_setup_pipeline_components()
147
+ )
148
+ pipeline.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipeline.scheduler.config)
149
+
150
+ if model_cpu_offload:
151
+ pipeline.enable_model_cpu_offload()
152
+ return pipeline
153
+
154
+ def generate_image(pipe, prompt, height, width, num_inference_steps, guidance_scale, seed, output_save_path):
155
+ with torch.inference_mode():
156
+ image = pipe(
157
+ prompt=prompt,
158
+ height=height,
159
+ width=width,
160
+ num_inference_steps=num_inference_steps,
161
+ guidance_scale=guidance_scale,
162
+ generator=torch.Generator("cuda").manual_seed(seed),
163
+ ).images[0]
164
+
165
+ if output_save_path:
166
+ image.save(f"{output_save_path}\\example_{seed}.png")
167
+ else:
168
+ image.save(f"example_{seed}.png")
169
+ torch.cuda.empty_cache()
170
+
171
+ def main(args):
172
+ pipeline = setup_zimage_pipeline(args.local_dir)
173
+ generate_image(
174
+ pipe=pipeline,
175
+ prompt=args.prompt,
176
+ width=args.width,
177
+ height=args.height,
178
+ num_inference_steps=args.num_inference_steps,
179
+ guidance_scale=args.guidance_scale,
180
+ seed=args.seed,
181
+ output_save_path=args.output_save_path
182
+ )
183
+
184
+ if __name__ == "__main__":
185
+ parser = argparse.ArgumentParser()
186
+
187
+ parser.add_argument(
188
+ "--local_dir", default=None, type=str, required=True, help="Path to the zimage diffusers local repository"
189
+ )
190
+ parser.add_argument(
191
+ "--prompt", default="Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights.", type=str, required=False, help="Prompt used to generate image."
192
+ )
193
+ parser.add_argument(
194
+ "--width", default=1024, type=int, required=False, help="Width of the generated image."
195
+ )
196
+ parser.add_argument(
197
+ "--height", default=1024, type=int, required=False, help="Height of the generated image."
198
+ )
199
+ parser.add_argument(
200
+ "--num_inference_steps", default=9, type=int, required=False, help="Number of the inference steps."
201
+ )
202
+ parser.add_argument(
203
+ "--guidance_scale", default=0.0, type=float, required=False, help="Guidance scale setting."
204
+ )
205
+ parser.add_argument(
206
+ "--seed", default=42, type=int, required=False, help="Random seed value."
207
+ )
208
+ parser.add_argument(
209
+ "--output_save_path",
210
+ default=None,
211
+ type=str,
212
+ required=False,
213
+ help="Path to the directory for generated image.",
214
+ )
215
+
216
+ args = parser.parse_args()
217
+ main(args)
218
+
219
+ ```
220
+
221
+ </details>
222
+
223
+ ```shell
224
+ python download_repo.py --local_dir "C:\\zimage"
225
+ python generate_image.py --local_dir "C:\\zimage"
226
+ ```
227
 
228
+ With custom parameters:
229
+ ```shell
230
+ python generate_image.py --local_dir "C:\\zimage" --prompt "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights." --width 1024 --height 1024 --num_inference_steps 9 --guidance_scale 0.0 --seed 42 --output_save_path "C:\\zimage_generations"
231
  ```
232
 
233
  #### Example 2
234
 
235
+ <details>
236
+ <summary>Code</summary>
237
+
238
+ ```python
239
+ import torch
240
+ from diffusers import ZImagePipeline
241
+
242
+ # 1. Load the pipeline
243
+ # Use bfloat16 for optimal performance on supported GPUs
244
+ pipe = ZImagePipeline.from_pretrained(
245
+ "path/to/model_files_main_dir",
246
+ torch_dtype=torch.float32, # or torch.bfloat16 / torch.float16
247
+ low_cpu_mem_usage=False,
248
+ )
249
+ pipe.to("cuda")
250
+
251
+ # [Optional] Attention Backend
252
+ # Diffusers uses SDPA by default. Switch to Flash Attention for better efficiency if supported:
253
+ # pipe.transformer.set_attention_backend("flash") # Enable Flash-Attention-2
254
+ # pipe.transformer.set_attention_backend("_flash_3") # Enable Flash-Attention-3
255
+
256
+ # [Optional] Model Compilation
257
+ # Compiling the DiT model accelerates inference, but the first run will take longer to compile.
258
+ # pipe.transformer.compile()
259
+
260
+ # [Optional] CPU Offloading
261
+ # Enable CPU offloading for memory-constrained devices.
262
+ # pipe.enable_model_cpu_offload()
263
+
264
+ prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
265
+
266
+ # 2. Generate Image
267
+ image = pipe(
268
+ prompt=prompt,
269
+ height=1024,
270
+ width=1024,
271
+ num_inference_steps=9, # This actually results in 8 DiT forwards
272
+ guidance_scale=0.0, # Guidance should be 0 for the Turbo models
273
+ generator=torch.Generator("cuda").manual_seed(42),
274
+ ).images[0]
275
+
276
+ image.save("example.png")
277
+
278
+ ```
279
+
280
+ </details>
281
+
282
+ #### Example 3
283
+
284
+ <details>
285
+ <summary>Code</summary>
286
+
287
+ ```py
288
+ import torch
289
+ from diffusers import ZImagePipeline, ZImageTransformer2DModel, AutoencoderKL, FlowMatchEulerDiscreteScheduler
290
+ from transformers import Qwen3Model, Qwen2Tokenizer
291
+
292
+
293
+ MODEL_PATH = "tsqn/Z-Image-Turbo_fp32-fp16-bf16_full_and_ema-only"
294
+
295
+ vae = AutoencoderKL.from_pretrained(MODEL_PATH, subfolder="vae", torch_dtype=torch.bfloat16)
296
+ text_encoder = Qwen3Model.from_pretrained(MODEL_PATH, subfolder="text_encoder", torch_dtype=torch.bfloat16)
297
+ tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_PATH, subfolder="tokenizer")
298
+ transformer = ZImageTransformer2DModel.from_pretrained(MODEL_PATH, subfolder="transformer", torch_dtype=torch.float32)
299
+
300
+ pipe = ZImagePipeline.from_pretrained(
301
+ MODEL_PATH,
302
+ vae=vae,
303
+ text_encoder=text_encoder,
304
+ tokenizer=tokenizer,
305
+ transformer=transformer,
306
+ torch_dtype=torch.float32,
307
+ low_cpu_mem_usage=False,
308
+ )
309
+ pipe.scheduler = FlowMatchEulerDiscreteScheduler.from_config(pipe.scheduler.config)
310
+ pipe.enable_model_cpu_offload()
311
+
312
+ prompt = "Young Chinese woman in red Hanfu, intricate embroidery. Impeccable makeup, red floral forehead pattern. Elaborate high bun, golden phoenix headdress, red flowers, beads. Holds round folding fan with lady, trees, bird. Neon lightning-bolt lamp (⚡️), bright yellow glow, above extended left palm. Soft-lit outdoor night background, silhouetted tiered pagoda (西安大雁塔), blurred colorful distant lights."
313
+
314
+ with torch.inference_mode():
315
+ image = pipe(
316
+ prompt=prompt,
317
+ height=1024,
318
+ width=1024,
319
+ num_inference_steps=9,
320
+ guidance_scale=0.0,
321
+ generator=torch.Generator("cuda").manual_seed(42),
322
+ ).images[0]
323
+
324
+ image.save("example.png")
325
+ torch.cuda.empty_cache()
326
+ ```
327
+
328
+ </details>
329
 
330
 
331
  ## 🎯 Recommendations