import sys
sys.path.append("./")

# import torch
# from torchvision import transforms
from meissonic.transformer import Transformer2DModel as TransformerMeissonic
from meissonic.pipeline import Pipeline as PipelineMeissonic
from meissonic.scheduler import Scheduler as MeissonicScheduler
from transformers import CLIPTextModelWithProjection, CLIPTokenizer
from diffusers import VQModel

device = 'cuda'
model_path = 'MeissonFlow/Meissonic'
cache_dir = '/mnt/models/Diffusers'

# diffusers_load_config['variant'] = fp16

model = TransformerMeissonic.from_pretrained(model_path, subfolder="transformer", cache_dir=cache_dir)
vq_model = VQModel.from_pretrained(model_path, subfolder="vqvae", cache_dir=cache_dir)
# text_encoder = CLIPTextModelWithProjection.from_pretrained(model_path,subfolder="text_encoder",)
text_encoder = CLIPTextModelWithProjection.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K", cache_dir=cache_dir)
tokenizer = CLIPTokenizer.from_pretrained(model_path, subfolder="tokenizer")
scheduler = MeissonicScheduler.from_pretrained(model_path, subfolder="scheduler")
pipe = PipelineMeissonic(vq_model, tokenizer=tokenizer, text_encoder=text_encoder, transformer=model, scheduler=scheduler)
pipe = pipe.to(device)

steps = 64
guidance_scale = 9
resolution = 1024 
negative = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark"
prompt = "Beautiful young woman posing on a lake with snow covered mountains in the background"
image = pipe(prompt=prompt, negative_prompt=negative, height=resolution, width=resolution, guidance_scale=guidance_scale, num_inference_steps=steps).images[0]
image.save('/tmp/meissonic.png')