"""Generate one image with the Meissonic pipeline and save it to disk.

The script loads the transformer, VQ model, text encoder, tokenizer and
scheduler, assembles them into a ``MeissonicPipeline``, runs a single
prompt through it and writes the first resulting image to
``/tmp/meissonic.png``.
"""

import sys

# Must run before the `meissonic` imports below; presumably this lets the
# local `meissonic` package resolve when the script is started from the
# repository root -- confirm against the project layout.
sys.path.append("./")

# import torch
# from torchvision import transforms
from meissonic.transformer import Transformer2DModel as TransformerMeissonic
from meissonic.pipeline import MeissonicPipeline
from meissonic.scheduler import Scheduler as MeissonicScheduler
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import VQModel

# --- Run settings ----------------------------------------------------------
device = "cuda"
model_path = "MeissonFlow/Meissonic"
cache_dir = "/mnt/models/Diffusers"
# Disabled: diffusers_load_config['variant'] = fp16

steps = 64
guidance_scale = 9
resolution = 1024  # used for both height and width of the generated image
negative = (
    "worst quality, low quality, low res, blurry, distortion, watermark, logo, "
    "signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, "
    "identifying mark"
)
prompt = (
    "Beautiful young woman posing on a lake with snow covered mountains "
    "in the background"
)

# --- Component loading -----------------------------------------------------
model = TransformerMeissonic.from_pretrained(
    model_path,
    subfolder="transformer",
    cache_dir=cache_dir,
)
vq_model = VQModel.from_pretrained(
    model_path,
    subfolder="vqvae",
    cache_dir=cache_dir,
)

# The text encoder comes from the LAION CLIP checkpoint instead of the
# `text_encoder` subfolder of `model_path`. Disabled alternative:
# text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path,subfolder="text_encoder",)
text_encoder = CLIPTextModelWithProjection.from_pretrained(
    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
    cache_dir=cache_dir,
)

# NOTE(review): unlike the three loads above, the tokenizer and scheduler are
# loaded without `cache_dir` -- confirm whether that is intentional.
tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
scheduler = MeissonicScheduler.from_pretrained(model_path, subfolder="scheduler")

# --- Pipeline assembly -----------------------------------------------------
pipe = MeissonicPipeline(
    vq_model,
    tokenizer=tokenizer,
    text_encoder=text_encoder,
    transformer=model,
    scheduler=scheduler,
)
pipe = pipe.to(device)

# --- Generation ------------------------------------------------------------
result = pipe(
    prompt=prompt,
    negative_prompt=negative,
    height=resolution,
    width=resolution,
    guidance_scale=guidance_scale,
    num_inference_steps=steps,
)
image = result.images[0]  # only the first generated image is kept
image.save("/tmp/meissonic.png")