mirror of https://github.com/vladmandic/automatic
parent
a5b77b8ee2
commit
8c676d0a61
|
|
@ -19,7 +19,7 @@ ci:
|
||||||
repos:
|
repos:
|
||||||
# Standard hooks
|
# Standard hooks
|
||||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||||
rev: v4.4.0
|
rev: v5.0.0
|
||||||
hooks:
|
hooks:
|
||||||
- id: check-added-large-files
|
- id: check-added-large-files
|
||||||
- id: check-case-conflict
|
- id: check-case-conflict
|
||||||
|
|
@ -27,7 +27,6 @@ repos:
|
||||||
- id: check-symlinks
|
- id: check-symlinks
|
||||||
- id: check-yaml
|
- id: check-yaml
|
||||||
args: ["--allow-multiple-documents"]
|
args: ["--allow-multiple-documents"]
|
||||||
- id: debug-statements
|
|
||||||
- id: end-of-file-fixer
|
- id: end-of-file-fixer
|
||||||
- id: mixed-line-ending
|
- id: mixed-line-ending
|
||||||
- id: trailing-whitespace
|
- id: trailing-whitespace
|
||||||
|
|
@ -36,31 +35,3 @@ repos:
|
||||||
.*\.md|
|
.*\.md|
|
||||||
.github/ISSUE_TEMPLATE/.*\.yml
|
.github/ISSUE_TEMPLATE/.*\.yml
|
||||||
)$
|
)$
|
||||||
|
|
||||||
- repo: https://github.com/charliermarsh/ruff-pre-commit
|
|
||||||
rev: 'v0.0.285'
|
|
||||||
hooks:
|
|
||||||
- id: ruff
|
|
||||||
args: [--fix, --exit-non-zero-on-fix]
|
|
||||||
- repo: local
|
|
||||||
hooks:
|
|
||||||
- id: pylint
|
|
||||||
name: pylint
|
|
||||||
entry: pylint
|
|
||||||
language: system
|
|
||||||
types: [python]
|
|
||||||
args: []
|
|
||||||
|
|
||||||
# Black, the code formatter, natively supports pre-commit
|
|
||||||
# - repo: https://github.com/psf/black
|
|
||||||
# rev: 23.7.0
|
|
||||||
# hooks:
|
|
||||||
# - id: black
|
|
||||||
# exclude: ^(docs)
|
|
||||||
|
|
||||||
# Changes tabs to spaces
|
|
||||||
# - repo: https://github.com/Lucas-C/pre-commit-hooks
|
|
||||||
# rev: v1.5.3
|
|
||||||
# hooks:
|
|
||||||
# - id: remove-tabs
|
|
||||||
# exclude: ^(docs)
|
|
||||||
|
|
|
||||||
|
|
@ -60,7 +60,7 @@ ignore-patterns=.*test*.py$,
|
||||||
.*_model.py$,
|
.*_model.py$,
|
||||||
.*_arch.py$,
|
.*_arch.py$,
|
||||||
.*_model_arch.py*,
|
.*_model_arch.py*,
|
||||||
.*_model_arch_v2.py$,
|
.*_model_arch_v2.py$,
|
||||||
ignored-modules=
|
ignored-modules=
|
||||||
jobs=0
|
jobs=0
|
||||||
limit-inference-results=100
|
limit-inference-results=100
|
||||||
|
|
|
||||||
|
|
@ -10,4 +10,4 @@
|
||||||
"python.analysis.typeCheckingMode": "off",
|
"python.analysis.typeCheckingMode": "off",
|
||||||
"editor.formatOnSave": false,
|
"editor.formatOnSave": false,
|
||||||
"python.REPL.enableREPLSmartSend": false
|
"python.REPL.enableREPLSmartSend": false
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -55,7 +55,7 @@
|
||||||
- major refactoring of requirements and dependencies to unblock `numpy>=2.1.0`
|
- major refactoring of requirements and dependencies to unblock `numpy>=2.1.0`
|
||||||
- patch `insightface`
|
- patch `insightface`
|
||||||
- stronger lint rules
|
- stronger lint rules
|
||||||
add separate `npm run lint`, `npm run todo`, `npm run test` macros
|
add separate `npm run lint`, `npm run todo`, `npm run test`, `npm run format` macros
|
||||||
|
|
||||||
## Update for 2025-06-30
|
## Update for 2025-06-30
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,7 @@ repository-code: 'https://github.com/vladmandic/sdnext'
|
||||||
abstract: >-
|
abstract: >-
|
||||||
SD.Next: Advanced Implementation of Stable Diffusion
|
SD.Next: Advanced Implementation of Stable Diffusion
|
||||||
and other diffusion models for text, image and video
|
and other diffusion models for text, image and video
|
||||||
generation
|
generation
|
||||||
keywords:
|
keywords:
|
||||||
- stablediffusion diffusers sdnext
|
- stablediffusion diffusers sdnext
|
||||||
license: Apache-2.0
|
license: Apache-2.0
|
||||||
|
|
|
||||||
|
|
@ -1,17 +1,17 @@
|
||||||
# Code of Conduct
|
# Code of Conduct
|
||||||
|
|
||||||
Use your best judgement
|
Use your best judgement
|
||||||
If it will possibly make others uncomfortable, do not post it
|
If it will possibly make others uncomfortable, do not post it
|
||||||
|
|
||||||
- Be respectful
|
- Be respectful
|
||||||
Disagreement is not an opportunity to attack someone else's thoughts or opinions
|
Disagreement is not an opportunity to attack someone else's thoughts or opinions
|
||||||
Although views may differ, remember to approach every situation with patience and care
|
Although views may differ, remember to approach every situation with patience and care
|
||||||
- Be considerate
|
- Be considerate
|
||||||
Think about how your contribution will affect others in the community
|
Think about how your contribution will affect others in the community
|
||||||
- Be open minded
|
- Be open minded
|
||||||
Embrace new people and new ideas. Our community is continually evolving and we welcome positive change
|
Embrace new people and new ideas. Our community is continually evolving and we welcome positive change
|
||||||
|
|
||||||
Be mindful of your language
|
Be mindful of your language
|
||||||
Any of the following behavior is unacceptable:
|
Any of the following behavior is unacceptable:
|
||||||
|
|
||||||
- Offensive comments of any kind
|
- Offensive comments of any kind
|
||||||
|
|
@ -20,7 +20,7 @@ Any of the following behavior is unacceptable:
|
||||||
|
|
||||||
If you believe someone is violating the code of conduct, we ask that you report it
|
If you believe someone is violating the code of conduct, we ask that you report it
|
||||||
|
|
||||||
Participants asked to stop any harassing behavior are expected to comply immediately
|
Participants asked to stop any harassing behavior are expected to comply immediately
|
||||||
|
|
||||||
<br>
|
<br>
|
||||||
|
|
||||||
|
|
|
||||||
40
CONTRIBUTING
40
CONTRIBUTING
|
|
@ -4,24 +4,24 @@ Pull requests from everyone are welcome
|
||||||
|
|
||||||
Procedure for contributing:
|
Procedure for contributing:
|
||||||
|
|
||||||
- Select SD.Next `dev` branch:
|
- Select SD.Next `dev` branch:
|
||||||
<https://github.com/vladmandic/sdnext/tree/dev>
|
<https://github.com/vladmandic/sdnext/tree/dev>
|
||||||
- Create a fork of the repository on github
|
- Create a fork of the repository on github
|
||||||
In the top right corner of GitHub, select "Fork"
|
In the top right corner of GitHub, select "Fork"
|
||||||
It's recommended to fork the latest version from the main branch to avoid any possible conflicting code updates
|
It's recommended to fork the latest version from the main branch to avoid any possible conflicting code updates
|
||||||
- Clone your forked repository to your local system
|
- Clone your forked repository to your local system
|
||||||
`git clone https://github.com/<your-username>/<your-fork>`
|
`git clone https://github.com/<your-username>/<your-fork>`
|
||||||
- Make your changes
|
- Make your changes
|
||||||
- Test your changes
|
- Test your changes
|
||||||
- Lint your changes against code guidelines
|
- Lint your changes against code guidelines
|
||||||
- `ruff check`
|
- `ruff check`
|
||||||
- `pylint <folder>/<filename>.py`
|
- `pylint <folder>/<filename>.py`
|
||||||
- Push changes to your fork
|
- Push changes to your fork
|
||||||
- Submit a PR (pull request)
|
- Submit a PR (pull request)
|
||||||
- Make sure that PR is against `dev` branch
|
- Make sure that PR is against `dev` branch
|
||||||
- Update your fork before creating the PR so that it is based on the latest code
|
- Update your fork before creating the PR so that it is based on the latest code
|
||||||
- Make sure that PR does NOT include any unrelated edits
|
- Make sure that PR does NOT include any unrelated edits
|
||||||
- Make sure that PR does not include changes to submodules
|
- Make sure that PR does not include changes to submodules
|
||||||
|
|
||||||
Your pull request will be reviewed and, pending review results, merged into the `dev` branch
|
Your pull request will be reviewed and, pending review results, merged into the `dev` branch
|
||||||
Dev merges to main are performed regularly and any PRs that are merged to `dev` will be included in the next main release
|
Dev merges to main are performed regularly and any PRs that are merged to `dev` will be included in the next main release
|
||||||
|
|
|
||||||
|
|
@ -39,7 +39,7 @@ ENV SD_MODELSDIR="/mnt/models"
|
||||||
ENV SD_DOCKER=true
|
ENV SD_DOCKER=true
|
||||||
|
|
||||||
# tcmalloc is not required but it is highly recommended
|
# tcmalloc is not required but it is highly recommended
|
||||||
ENV LD_PRELOAD=libtcmalloc.so.4
|
ENV LD_PRELOAD=libtcmalloc.so.4
|
||||||
# sdnext will run all necessary pip install ops and then exit
|
# sdnext will run all necessary pip install ops and then exit
|
||||||
RUN ["python", "/app/launch.py", "--debug", "--uv", "--use-cuda", "--log", "sdnext.log", "--test", "--optional"]
|
RUN ["python", "/app/launch.py", "--debug", "--uv", "--use-cuda", "--log", "sdnext.log", "--test", "--optional"]
|
||||||
# preinstall additional packages to avoid installation during runtime
|
# preinstall additional packages to avoid installation during runtime
|
||||||
|
|
|
||||||
|
|
@ -129425,4 +129425,4 @@
|
||||||
],
|
],
|
||||||
"byte_fallback": false
|
"byte_fallback": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -129425,4 +129425,4 @@
|
||||||
],
|
],
|
||||||
"byte_fallback": false
|
"byte_fallback": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Zoe-Tiefe"
|
"hint": "Zoe-Tiefe"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,7 @@
|
||||||
{"id":"","label":"🖼️","localized":"","hint":"Show preview"},
|
{"id":"","label":"🖼️","localized":"","hint":"Show preview"},
|
||||||
{"id":"","label":"♻","localized":"","hint":"Interrogate image"},
|
{"id":"","label":"♻","localized":"","hint":"Interrogate image"},
|
||||||
{"id":"","label":"↶","localized":"","hint":"Apply selected style to prompt"},
|
{"id":"","label":"↶","localized":"","hint":"Apply selected style to prompt"},
|
||||||
{"id":"","label":"↷","localized":"","hint":"Save current prompt to style"}
|
{"id":"","label":"↷","localized":"","hint":"Save current prompt to style"}
|
||||||
],
|
],
|
||||||
"main": [
|
"main": [
|
||||||
{"id":"","label":"Prompt","localized":"","hint":"Describe image you want to generate"},
|
{"id":"","label":"Prompt","localized":"","hint":"Describe image you want to generate"},
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "profundidad zoe"
|
"hint": "profundidad zoe"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Profondeur ZOE"
|
"hint": "Profondeur ZOE"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "zoe dubina"
|
"hint": "zoe dubina"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Profondità ZOE"
|
"hint": "Profondità ZOE"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Zoe深度"
|
"hint": "Zoe深度"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Zoe 깊이"
|
"hint": "Zoe 깊이"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Profundidade ZOE"
|
"hint": "Profundidade ZOE"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "Zoe глубина"
|
"hint": "Zoe глубина"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -7553,4 +7553,4 @@
|
||||||
"hint": "ZOE深度"
|
"hint": "ZOE深度"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -1,2 +1,2 @@
|
||||||
[
|
[
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,3 @@
|
||||||
[
|
[
|
||||||
{ "id": "", "label": "Reprocess", "localized": "Ponovi", "hint": "Ponovno obradite prethodne generacije koristeći različite parametre" }
|
{ "id": "", "label": "Reprocess", "localized": "Ponovi", "hint": "Ponovno obradite prethodne generacije koristeći različite parametre" }
|
||||||
]
|
]
|
||||||
|
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -50,7 +50,7 @@
|
||||||
"preview": "dreamshaperXL_v21TurboDPMSDE.jpg",
|
"preview": "dreamshaperXL_v21TurboDPMSDE.jpg",
|
||||||
"desc": "Showcase finetuned model based on Stable diffusion XL",
|
"desc": "Showcase finetuned model based on Stable diffusion XL",
|
||||||
"extras": "sampler: DPM SDE, steps: 8, cfg_scale: 2.0"
|
"extras": "sampler: DPM SDE, steps: 8, cfg_scale: 2.0"
|
||||||
},
|
},
|
||||||
|
|
||||||
"SDXS DreamShaper 512": {
|
"SDXS DreamShaper 512": {
|
||||||
"path": "IDKiro/sdxs-512-dreamshaper",
|
"path": "IDKiro/sdxs-512-dreamshaper",
|
||||||
|
|
@ -194,37 +194,37 @@
|
||||||
"desc": "Sana is an efficient model with scaling of training-time and inference time techniques. SANA-1.5 delivers: efficient model growth from 1.6B Sana-1.0 model to 4.8B, achieving similar or better performance than training from scratch and saving 60% training cost; efficient model depth pruning, slimming any model size as you want; powerful VLM selection based inference scaling, smaller model+inference scaling > larger model.",
|
"desc": "Sana is an efficient model with scaling of training-time and inference time techniques. SANA-1.5 delivers: efficient model growth from 1.6B Sana-1.0 model to 4.8B, achieving similar or better performance than training from scratch and saving 60% training cost; efficient model depth pruning, slimming any model size as you want; powerful VLM selection based inference scaling, smaller model+inference scaling > larger model.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.5 4.8B 1k": {
|
"NVLabs Sana 1.5 4.8B 1k": {
|
||||||
"path": "Efficient-Large-Model/SANA1.5_4.8B_1024px_diffusers",
|
"path": "Efficient-Large-Model/SANA1.5_4.8B_1024px_diffusers",
|
||||||
"desc": "Sana is an efficient model with scaling of training-time and inference time techniques. SANA-1.5 delivers: efficient model growth from 1.6B Sana-1.0 model to 4.8B, achieving similar or better performance than training from scratch and saving 60% training cost; efficient model depth pruning, slimming any model size as you want; powerful VLM selection based inference scaling, smaller model+inference scaling > larger model.",
|
"desc": "Sana is an efficient model with scaling of training-time and inference time techniques. SANA-1.5 delivers: efficient model growth from 1.6B Sana-1.0 model to 4.8B, achieving similar or better performance than training from scratch and saving 60% training cost; efficient model depth pruning, slimming any model size as you want; powerful VLM selection based inference scaling, smaller model+inference scaling > larger model.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.5 1.6B 1k Sprint": {
|
"NVLabs Sana 1.5 1.6B 1k Sprint": {
|
||||||
"path": "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers",
|
"path": "Efficient-Large-Model/Sana_Sprint_1.6B_1024px_diffusers",
|
||||||
"desc": "SANA-Sprint is an ultra-efficient diffusion model for text-to-image (T2I) generation, reducing inference steps from 20 to 1-4 while achieving state-of-the-art performance.",
|
"desc": "SANA-Sprint is an ultra-efficient diffusion model for text-to-image (T2I) generation, reducing inference steps from 20 to 1-4 while achieving state-of-the-art performance.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.0 1.6B 4k": {
|
"NVLabs Sana 1.0 1.6B 4k": {
|
||||||
"path": "Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers",
|
"path": "Efficient-Large-Model/Sana_1600M_4Kpx_BF16_diffusers",
|
||||||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.0 1.6B 2k": {
|
"NVLabs Sana 1.0 1.6B 2k": {
|
||||||
"path": "Efficient-Large-Model/Sana_1600M_2Kpx_BF16_diffusers",
|
"path": "Efficient-Large-Model/Sana_1600M_2Kpx_BF16_diffusers",
|
||||||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.0 1.6B 1k": {
|
"NVLabs Sana 1.0 1.6B 1k": {
|
||||||
"path": "Efficient-Large-Model/Sana_1600M_1024px_diffusers",
|
"path": "Efficient-Large-Model/Sana_1600M_1024px_diffusers",
|
||||||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||||
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
"preview": "Efficient-Large-Model--Sana_1600M_1024px_diffusers.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
"NVLabs Sana 1.0 0.6B 0.5k": {
|
"NVLabs Sana 1.0 0.6B 0.5k": {
|
||||||
"path": "Efficient-Large-Model/Sana_600M_512px_diffusers",
|
"path": "Efficient-Large-Model/Sana_600M_512px_diffusers",
|
||||||
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
"desc": "Sana is a text-to-image framework that can efficiently generate images up to 4096 × 4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU.",
|
||||||
|
|
@ -250,21 +250,21 @@
|
||||||
"desc": "OmniGen is a unified image generation model that can generate a wide range of images from multi-modal prompts. It is designed to be simple, flexible and easy to use.",
|
"desc": "OmniGen is a unified image generation model that can generate a wide range of images from multi-modal prompts. It is designed to be simple, flexible and easy to use.",
|
||||||
"preview": "Shitao--OmniGen-v1.jpg",
|
"preview": "Shitao--OmniGen-v1.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"VectorSpaceLab OmniGen v2": {
|
"VectorSpaceLab OmniGen v2": {
|
||||||
"path": "OmniGen2/OmniGen2",
|
"path": "OmniGen2/OmniGen2",
|
||||||
"desc": "OmniGen2 is a powerful and efficient unified multimodal model. Unlike OmniGen v1, OmniGen2 features two distinct decoding pathways for text and image modalities, utilizing unshared parameters and a decoupled image tokenizer.",
|
"desc": "OmniGen2 is a powerful and efficient unified multimodal model. Unlike OmniGen v1, OmniGen2 features two distinct decoding pathways for text and image modalities, utilizing unshared parameters and a decoupled image tokenizer.",
|
||||||
"preview": "OmniGen2--OmniGen2.jpg",
|
"preview": "OmniGen2--OmniGen2.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"AuraFlow 0.3": {
|
"AuraFlow 0.3": {
|
||||||
"path": "fal/AuraFlow-v0.3",
|
"path": "fal/AuraFlow-v0.3",
|
||||||
"desc": "AuraFlow v0.3 is the fully open-sourced flow-based text-to-image generation model. The model was trained with more compute compared to the previous version, AuraFlow-v0.2. Compared to AuraFlow-v0.2, the model is fine-tuned on more aesthetic datasets and now supports various aspect ratio, (now width and height up to 1536 pixels).",
|
"desc": "AuraFlow v0.3 is the fully open-sourced flow-based text-to-image generation model. The model was trained with more compute compared to the previous version, AuraFlow-v0.2. Compared to AuraFlow-v0.2, the model is fine-tuned on more aesthetic datasets and now supports various aspect ratio, (now width and height up to 1536 pixels).",
|
||||||
"preview": "fal--AuraFlow-v0.3.jpg",
|
"preview": "fal--AuraFlow-v0.3.jpg",
|
||||||
"skip": true
|
"skip": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"Segmind Vega": {
|
"Segmind Vega": {
|
||||||
"path": "huggingface/segmind/Segmind-Vega",
|
"path": "huggingface/segmind/Segmind-Vega",
|
||||||
|
|
@ -334,7 +334,7 @@
|
||||||
"skip": true,
|
"skip": true,
|
||||||
"extras": "sampler: Default, cfg_scale: 2.0"
|
"extras": "sampler: Default, cfg_scale: 2.0"
|
||||||
},
|
},
|
||||||
|
|
||||||
"Tencent HunyuanDiT 1.2": {
|
"Tencent HunyuanDiT 1.2": {
|
||||||
"path": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
|
"path": "Tencent-Hunyuan/HunyuanDiT-v1.2-Diffusers",
|
||||||
"desc": "Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding.",
|
"desc": "Hunyuan-DiT : A Powerful Multi-Resolution Diffusion Transformer with Fine-Grained Chinese Understanding.",
|
||||||
|
|
@ -348,7 +348,7 @@
|
||||||
"preview": "Alpha-VLLM--Lumina-Next-SFT-diffusers.jpg",
|
"preview": "Alpha-VLLM--Lumina-Next-SFT-diffusers.jpg",
|
||||||
"skip": true,
|
"skip": true,
|
||||||
"extras": "sampler: Default"
|
"extras": "sampler: Default"
|
||||||
},
|
},
|
||||||
"AlphaVLLM Lumina 2": {
|
"AlphaVLLM Lumina 2": {
|
||||||
"path": "Alpha-VLLM/Lumina-Image-2.0",
|
"path": "Alpha-VLLM/Lumina-Image-2.0",
|
||||||
"desc": "A Unified and Efficient Image Generative Model. Lumina-Image-2.0 is a 2 billion parameter flow-based diffusion transformer capable of generating images from text descriptions.",
|
"desc": "A Unified and Efficient Image Generative Model. Lumina-Image-2.0 is a 2 billion parameter flow-based diffusion transformer capable of generating images from text descriptions.",
|
||||||
|
|
@ -392,7 +392,7 @@
|
||||||
"preview": "Kwai-Kolors--Kolors-diffusers.jpg",
|
"preview": "Kwai-Kolors--Kolors-diffusers.jpg",
|
||||||
"skip": true,
|
"skip": true,
|
||||||
"extras": "width: 1024, height: 1024"
|
"extras": "width: 1024, height: 1024"
|
||||||
},
|
},
|
||||||
|
|
||||||
"Kandinsky 2.1": {
|
"Kandinsky 2.1": {
|
||||||
"path": "kandinsky-community/kandinsky-2-1",
|
"path": "kandinsky-community/kandinsky-2-1",
|
||||||
|
|
|
||||||
|
|
@ -865,4 +865,4 @@
|
||||||
linear-gradient(270deg, #696969 30%, rgba(0, 0, 0, 0) 31%);
|
linear-gradient(270deg, #696969 30%, rgba(0, 0, 0, 0) 31%);
|
||||||
background-color: #b6b6b6;
|
background-color: #b6b6b6;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@
|
||||||
.token-counter div { display: inline; }
|
.token-counter div { display: inline; }
|
||||||
.token-counter span { padding: 0.1em 0.75em; }
|
.token-counter span { padding: 0.1em 0.75em; }
|
||||||
|
|
||||||
/* tooltips and statuses */
|
/* tooltips and statuses */
|
||||||
.infotext { overflow-wrap: break-word; }
|
.infotext { overflow-wrap: break-word; }
|
||||||
.tooltip { display: block; position: fixed; top: 1em; right: 1em; padding: 0.5em; background: var(--input-background-fill); color: var(--body-text-color); border: 1pt solid var(--button-primary-border-color);
|
.tooltip { display: block; position: fixed; top: 1em; right: 1em; padding: 0.5em; background: var(--input-background-fill); color: var(--body-text-color); border: 1pt solid var(--button-primary-border-color);
|
||||||
width: 22em; min-height: 1.3em; font-size: 0.8em; transition: opacity 0.2s ease-in; pointer-events: none; opacity: 0; z-index: 999; }
|
width: 22em; min-height: 1.3em; font-size: 0.8em; transition: opacity 0.2s ease-in; pointer-events: none; opacity: 0; z-index: 999; }
|
||||||
|
|
@ -124,7 +124,7 @@ table.settings-value-table td { padding: 0.4em; border: 1px solid #ccc; max-widt
|
||||||
#tab-browser-search { padding: 0; }
|
#tab-browser-search { padding: 0; }
|
||||||
#tab-browser-status { align-content: center; text-align: right; background: var(--input-background-fill); padding-right: 1em; margin-left: -1em; }
|
#tab-browser-status { align-content: center; text-align: right; background: var(--input-background-fill); padding-right: 1em; margin-left: -1em; }
|
||||||
div:has(>#tab-browser-folders) { flex-grow: 0 !important; background-color: var(--input-background-fill); min-width: max-content !important; }
|
div:has(>#tab-browser-folders) { flex-grow: 0 !important; background-color: var(--input-background-fill); min-width: max-content !important; }
|
||||||
.browser-separator { background-color: var(--input-background-fill); font-size: larger; padding: 0.5em; display: block !important; }
|
.browser-separator { background-color: var(--input-background-fill); font-size: larger; padding: 0.5em; display: block !important; }
|
||||||
|
|
||||||
/* loader */
|
/* loader */
|
||||||
.splash { position: fixed; top: 0; left: 0; width: 100vw; height: 100vh; z-index: 1000; display: block; text-align: center; }
|
.splash { position: fixed; top: 0; left: 0; width: 100vw; height: 100vh; z-index: 1000; display: block; text-align: center; }
|
||||||
|
|
|
||||||
|
|
@ -1192,4 +1192,4 @@ svg.feather.feather-image,
|
||||||
--button-transition: none;
|
--button-transition: none;
|
||||||
--size-9: 64px;
|
--size-9: 64px;
|
||||||
--size-14: 64px;
|
--size-14: 64px;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@
|
||||||
--inactive-color: var(--primary--800);
|
--inactive-color: var(--primary--800);
|
||||||
--body-text-color: var(--neutral-100);
|
--body-text-color: var(--neutral-100);
|
||||||
--body-text-color-subdued: var(--neutral-300);
|
--body-text-color-subdued: var(--neutral-300);
|
||||||
--background-color: var(--primary-100);
|
--background-color: var(--primary-100);
|
||||||
--background-fill-primary: var(--input-background-fill);
|
--background-fill-primary: var(--input-background-fill);
|
||||||
--input-padding: 8px;
|
--input-padding: 8px;
|
||||||
--input-background-fill: var(--primary-200);
|
--input-background-fill: var(--primary-200);
|
||||||
|
|
|
||||||
|
|
@ -6,15 +6,15 @@
|
||||||
--primary-100: #2b303b;
|
--primary-100: #2b303b;
|
||||||
--primary-200: #15181e;
|
--primary-200: #15181e;
|
||||||
--primary-300: #0a0c0e;
|
--primary-300: #0a0c0e;
|
||||||
--primary-400: #566176;
|
--primary-400: #566176;
|
||||||
--primary-500: #483a90;
|
--primary-500: #483a90;
|
||||||
--primary-700: #6b7994;
|
--primary-700: #6b7994;
|
||||||
--primary-800: #5b49b3;
|
--primary-800: #5b49b3;
|
||||||
--highlight-color: var(--primary-500);
|
--highlight-color: var(--primary-500);
|
||||||
--inactive-color: var(--primary--800);
|
--inactive-color: var(--primary--800);
|
||||||
--body-text-color: var(--neutral-100);
|
--body-text-color: var(--neutral-100);
|
||||||
--body-text-color-subdued: var(--neutral-300);
|
--body-text-color-subdued: var(--neutral-300);
|
||||||
--background-color: var(--primary-100);
|
--background-color: var(--primary-100);
|
||||||
--background-fill-primary: var(--input-background-fill);
|
--background-fill-primary: var(--input-background-fill);
|
||||||
--input-padding: 8px;
|
--input-padding: 8px;
|
||||||
--input-background-fill: var(--primary-200);
|
--input-background-fill: var(--primary-200);
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,7 @@ const loginCSS = `
|
||||||
|
|
||||||
const loginHTML = `
|
const loginHTML = `
|
||||||
<div id="loginDiv" style="margin: 15% auto; max-width: 200px; padding: 2em; background: var(--background-fill-secondary);">
|
<div id="loginDiv" style="margin: 15% auto; max-width: 200px; padding: 2em; background: var(--background-fill-secondary);">
|
||||||
<h2>Login</h2>
|
<h2>Login</h2>
|
||||||
<label for="username" style="margin-top: 0.5em">Username</label>
|
<label for="username" style="margin-top: 0.5em">Username</label>
|
||||||
<input type="text" id="loginUsername" name="username" style="width: 92%; padding: 0.5em; margin-top: 0.5em">
|
<input type="text" id="loginUsername" name="username" style="width: 92%; padding: 0.5em; margin-top: 0.5em">
|
||||||
<label for="password" style="margin-top: 0.5em">Password</label>
|
<label for="password" style="margin-top: 0.5em">Password</label>
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@
|
||||||
--inactive-color: var(--primary--800);
|
--inactive-color: var(--primary--800);
|
||||||
--body-text-color: var(--neutral-100);
|
--body-text-color: var(--neutral-100);
|
||||||
--body-text-color-subdued: var(--neutral-300);
|
--body-text-color-subdued: var(--neutral-300);
|
||||||
--background-color: var(--primary-100);
|
--background-color: var(--primary-100);
|
||||||
--background-fill-primary: var(--input-background-fill);
|
--background-fill-primary: var(--input-background-fill);
|
||||||
--input-padding: 8px;
|
--input-padding: 8px;
|
||||||
--input-background-fill: var(--primary-200);
|
--input-background-fill: var(--primary-200);
|
||||||
|
|
|
||||||
|
|
@ -387,7 +387,7 @@ div:has(>#tab-gallery-folders) { flex-grow: 0 !important; background-color: var(
|
||||||
--spacing-md: 4px;
|
--spacing-md: 4px;
|
||||||
--spacing-lg: 5px;
|
--spacing-lg: 5px;
|
||||||
--spacing-xl: 6px;
|
--spacing-xl: 6px;
|
||||||
--spacing-xxl: 7px;
|
--spacing-xxl: 7px;
|
||||||
}
|
}
|
||||||
|
|
||||||
@media (hover: none) and (pointer: coarse) { /* Apply different styles for devices with coarse pointers dependant on screen resolution */
|
@media (hover: none) and (pointer: coarse) { /* Apply different styles for devices with coarse pointers dependant on screen resolution */
|
||||||
|
|
@ -396,7 +396,7 @@ div:has(>#tab-gallery-folders) { flex-grow: 0 !important; background-color: var(
|
||||||
:root, .light, .dark { --left-column: 100%; }
|
:root, .light, .dark { --left-column: 100%; }
|
||||||
#txt2img_results, #img2img_results, #extras_results { min-width: calc(min(320px, 100%)) !important;} /* maintain single column for from image operations on larger mobile devices */
|
#txt2img_results, #img2img_results, #extras_results { min-width: calc(min(320px, 100%)) !important;} /* maintain single column for from image operations on larger mobile devices */
|
||||||
#txt2img_footer p { text-wrap: wrap; }
|
#txt2img_footer p { text-wrap: wrap; }
|
||||||
}
|
}
|
||||||
@media (min-width: 400px) { /* Screens larger than 400px wide */
|
@media (min-width: 400px) { /* Screens larger than 400px wide */
|
||||||
:root, .light, .dark {--left-column: 50%;}
|
:root, .light, .dark {--left-column: 50%;}
|
||||||
#txt2img_results, #extras_results, #txt2im g_footer p { text-wrap: wrap; max-width: 100% !important; } /* maintain side by side split on larger mobile displays for from text */
|
#txt2img_results, #extras_results, #txt2im g_footer p { text-wrap: wrap; max-width: 100% !important; } /* maintain side by side split on larger mobile displays for from text */
|
||||||
|
|
@ -408,7 +408,7 @@ div:has(>#tab-gallery-folders) { flex-grow: 0 !important; background-color: var(
|
||||||
#img2img_actions_column { display: flex; min-width: fit-content !important; flex-direction: row;justify-content: space-evenly; align-items: center;}
|
#img2img_actions_column { display: flex; min-width: fit-content !important; flex-direction: row;justify-content: space-evenly; align-items: center;}
|
||||||
#txt2img_generate_box, #img2img_generate_box, #txt2img_enqueue_wrapper,#img2img_enqueue_wrapper {display: flex;flex-direction: column;height: 4em !important;align-items: stretch;justify-content: space-evenly;}
|
#txt2img_generate_box, #img2img_generate_box, #txt2img_enqueue_wrapper,#img2img_enqueue_wrapper {display: flex;flex-direction: column;height: 4em !important;align-items: stretch;justify-content: space-evenly;}
|
||||||
#img2img_interface, #img2img_results, #img2img_footer p { text-wrap: wrap; min-width: 100% !important; max-width: 100% !important;} /* maintain single column for from image operations on larger mobile devices */
|
#img2img_interface, #img2img_results, #img2img_footer p { text-wrap: wrap; min-width: 100% !important; max-width: 100% !important;} /* maintain single column for from image operations on larger mobile devices */
|
||||||
#txt2img_sampler, #txt2img_batch, #txt2img_seed_group, #txt2img_advanced, #txt2img_second_pass, #img2img_sampling_group, #img2img_resize_group, #img2img_batch_group, #img2img_seed_group, #img2img_denoise_group, #img2img_advanced_group { width: 100% !important; } /* fix from text/image UI
|
#txt2img_sampler, #txt2img_batch, #txt2img_seed_group, #txt2img_advanced, #txt2img_second_pass, #img2img_sampling_group, #img2img_resize_group, #img2img_batch_group, #img2img_seed_group, #img2img_denoise_group, #img2img_advanced_group { width: 100% !important; } /* fix from text/image UI
|
||||||
elements to prevent them from moving around within the UI */
|
elements to prevent them from moving around within the UI */
|
||||||
#img2img_resize_group .gradio-radio>div { display: flex; flex-direction: column; width: unset !important; }
|
#img2img_resize_group .gradio-radio>div { display: flex; flex-direction: column; width: unset !important; }
|
||||||
#inpaint_controls div { display:flex;flex-direction: row;}
|
#inpaint_controls div { display:flex;flex-direction: row;}
|
||||||
|
|
@ -425,5 +425,5 @@ div:has(>#tab-gallery-folders) { flex-grow: 0 !important; background-color: var(
|
||||||
.gradio-slider input[type="number"] { width: 4em; font-size: var(--text-xs); height: 16px; text-align: center; } /* adjust slider input fields as they were too large for mobile devices. */
|
.gradio-slider input[type="number"] { width: 4em; font-size: var(--text-xs); height: 16px; text-align: center; } /* adjust slider input fields as they were too large for mobile devices. */
|
||||||
#txt2img_settings .block .padded:not(.gradio-accordion) { padding: 0 !important;margin-right: 0; min-width: 100% !important; width:100% !important;}
|
#txt2img_settings .block .padded:not(.gradio-accordion) { padding: 0 !important;margin-right: 0; min-width: 100% !important; width:100% !important;}
|
||||||
#script_txt2img_prompts_from_file_prompt_txt, #script_img2img_prompts_from_file_prompt_txt, #script_control2img_prompts_from_file_prompt_txt { resize: vertical !important; }
|
#script_txt2img_prompts_from_file_prompt_txt, #script_img2img_prompts_from_file_prompt_txt, #script_control2img_prompts_from_file_prompt_txt { resize: vertical !important; }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@
|
||||||
--inactive-color: var(--primary--800);
|
--inactive-color: var(--primary--800);
|
||||||
--body-text-color: var(--neutral-100);
|
--body-text-color: var(--neutral-100);
|
||||||
--body-text-color-subdued: var(--neutral-300);
|
--body-text-color-subdued: var(--neutral-300);
|
||||||
--background-color: var(--primary-100);
|
--background-color: var(--primary-100);
|
||||||
--background-fill-primary: var(--input-background-fill);
|
--background-fill-primary: var(--input-background-fill);
|
||||||
--input-padding: 8px;
|
--input-padding: 8px;
|
||||||
--input-background-fill: var(--primary-200);
|
--input-background-fill: var(--primary-200);
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,7 @@ def optimized_scale(positive_flat, negative_flat):
|
||||||
|
|
||||||
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
||||||
st_star = dot_product / squared_norm
|
st_star = dot_product / squared_norm
|
||||||
|
|
||||||
return st_star
|
return st_star
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -662,8 +662,8 @@ class CogView4CFGZeroPipeline(DiffusionPipeline, CogView4LoraLoaderMixin):
|
||||||
return_dict=False,
|
return_dict=False,
|
||||||
)[0]
|
)[0]
|
||||||
if use_cfg_zero_star:
|
if use_cfg_zero_star:
|
||||||
positive_flat = noise_pred_cond.view(batch_size, -1)
|
positive_flat = noise_pred_cond.view(batch_size, -1)
|
||||||
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
||||||
|
|
||||||
alpha = optimized_scale(positive_flat,negative_flat)
|
alpha = optimized_scale(positive_flat,negative_flat)
|
||||||
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_cond.shape) - 1)))
|
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_cond.shape) - 1)))
|
||||||
|
|
|
||||||
|
|
@ -77,7 +77,7 @@ def optimized_scale(positive_flat, negative_flat):
|
||||||
|
|
||||||
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
||||||
st_star = dot_product / squared_norm
|
st_star = dot_product / squared_norm
|
||||||
|
|
||||||
return st_star
|
return st_star
|
||||||
|
|
||||||
DEFAULT_PROMPT_TEMPLATE = {
|
DEFAULT_PROMPT_TEMPLATE = {
|
||||||
|
|
|
||||||
|
|
@ -80,7 +80,7 @@ def optimized_scale(positive_flat, negative_flat):
|
||||||
|
|
||||||
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
||||||
st_star = dot_product / squared_norm
|
st_star = dot_product / squared_norm
|
||||||
|
|
||||||
return st_star
|
return st_star
|
||||||
|
|
||||||
# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
|
# Copied from diffusers.pipelines.flux.pipeline_flux.calculate_shift
|
||||||
|
|
@ -1088,8 +1088,8 @@ class StableDiffusion3CFGZeroPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
|
||||||
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
|
||||||
|
|
||||||
if use_cfg_zero_star:
|
if use_cfg_zero_star:
|
||||||
positive_flat = noise_pred_text.view(batch_size, -1)
|
positive_flat = noise_pred_text.view(batch_size, -1)
|
||||||
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
||||||
|
|
||||||
alpha = optimized_scale(positive_flat,negative_flat)
|
alpha = optimized_scale(positive_flat,negative_flat)
|
||||||
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_text.shape) - 1)))
|
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_text.shape) - 1)))
|
||||||
|
|
|
||||||
|
|
@ -82,9 +82,9 @@ def optimized_scale(positive_flat, negative_flat):
|
||||||
|
|
||||||
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
# st_star = v_cond^T * v_uncond / ||v_uncond||^2
|
||||||
st_star = dot_product / squared_norm
|
st_star = dot_product / squared_norm
|
||||||
|
|
||||||
return st_star
|
return st_star
|
||||||
|
|
||||||
def basic_clean(text):
|
def basic_clean(text):
|
||||||
text = ftfy.fix_text(text)
|
text = ftfy.fix_text(text)
|
||||||
text = html.unescape(html.unescape(text))
|
text = html.unescape(html.unescape(text))
|
||||||
|
|
@ -555,8 +555,8 @@ class WanCFGZeroPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
||||||
|
|
||||||
noise_pred_text = noise_pred
|
noise_pred_text = noise_pred
|
||||||
if use_cfg_zero_star:
|
if use_cfg_zero_star:
|
||||||
positive_flat = noise_pred_text.view(batch_size, -1)
|
positive_flat = noise_pred_text.view(batch_size, -1)
|
||||||
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
negative_flat = noise_pred_uncond.view(batch_size, -1)
|
||||||
|
|
||||||
alpha = optimized_scale(positive_flat,negative_flat)
|
alpha = optimized_scale(positive_flat,negative_flat)
|
||||||
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_text.shape) - 1)))
|
alpha = alpha.view(batch_size, *([1] * (len(noise_pred_text.shape) - 1)))
|
||||||
|
|
|
||||||
|
|
@ -20,4 +20,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
|
|
|
||||||
|
|
@ -234,4 +234,3 @@ def resnext101_32x8d(pretrained=True, **kwargs):
|
||||||
|
|
||||||
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
model = ResNet(Bottleneck, [3, 4, 23, 3], **kwargs)
|
||||||
return model
|
return model
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -416,4 +416,3 @@ if __name__ == '__main__':
|
||||||
inputs = torch.ones(4,3,128,128)
|
inputs = torch.ones(4,3,128,128)
|
||||||
out = net(inputs)
|
out = net(inputs)
|
||||||
print(out.size())
|
print(out.size())
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,10 +2,10 @@ https://github.com/compphoto/BoostingMonocularDepth
|
||||||
|
|
||||||
Copyright 2021, Seyed Mahdi Hosseini Miangoleh, Sebastian Dille, Computational Photography Laboratory. All rights reserved.
|
Copyright 2021, Seyed Mahdi Hosseini Miangoleh, Sebastian Dille, Computational Photography Laboratory. All rights reserved.
|
||||||
|
|
||||||
This software is for academic use only. A redistribution of this
|
This software is for academic use only. A redistribution of this
|
||||||
software, with or without modifications, has to be for academic
|
software, with or without modifications, has to be for academic
|
||||||
use only, while giving the appropriate credit to the original
|
use only, while giving the appropriate credit to the original
|
||||||
authors of the software. The methods implemented as a part of
|
authors of the software. The methods implemented as a part of
|
||||||
this software may be covered under patents or patent applications.
|
this software may be covered under patents or patent applications.
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
|
THIS SOFTWARE IS PROVIDED BY THE AUTHOR ''AS IS'' AND ANY EXPRESS OR IMPLIED
|
||||||
|
|
@ -16,4 +16,4 @@ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
|
||||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
||||||
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
|
||||||
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
|
||||||
|
|
@ -165,4 +165,3 @@ class MiDaSInference(nn.Module):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
prediction = self.model(x)
|
prediction = self.model(x)
|
||||||
return prediction
|
return prediction
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -339,4 +339,3 @@ class FeatureFusionBlock_custom(nn.Module):
|
||||||
output = self.out_conv(output)
|
output = self.out_conv(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -106,4 +106,3 @@ class DPTDepthModel(DPT):
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
return super().forward(x).squeeze(dim=1)
|
return super().forward(x).squeeze(dim=1)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -198,4 +198,4 @@
|
||||||
distributed under the License is distributed on an "AS IS" BASIS,
|
distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
See the License for the specific language governing permissions and
|
See the License for the specific language governing permissions and
|
||||||
limitations under the License.
|
limitations under the License.
|
||||||
|
|
|
||||||
|
|
@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
|
|
|
||||||
|
|
@ -199,4 +199,3 @@ class Decoder(nn.Module):
|
||||||
return [out_res8, out_res4, out_res2, out_res1], \
|
return [out_res8, out_res4, out_res2, out_res1], \
|
||||||
[out_res8, samples_pred_res4, samples_pred_res2, samples_pred_res1], \
|
[out_res8, samples_pred_res4, samples_pred_res2, samples_pred_res1], \
|
||||||
[None, point_coords_res4, point_coords_res2, point_coords_res1]
|
[None, point_coords_res4, point_coords_res2, point_coords_res1]
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -319,5 +319,3 @@ export now requires additional args mentioned in the export script (not needed i
|
||||||
2. TF ported models with 'SAME' padding will have the padding fixed at export time to the resolution used for export. Even though dynamic padding is supported in opset >= 11, I can't get it working.
|
2. TF ported models with 'SAME' padding will have the padding fixed at export time to the resolution used for export. Even though dynamic padding is supported in opset >= 11, I can't get it working.
|
||||||
3. ONNX optimize facility doesn't work reliably in PyTorch 1.6 / ONNX 1.7. Fortunately, the onnxruntime based inference is working very well now and includes on the fly optimization.
|
3. ONNX optimize facility doesn't work reliably in PyTorch 1.6 / ONNX 1.7. Fortunately, the onnxruntime based inference is working very well now and includes on the fly optimization.
|
||||||
4. ONNX / Caffe2 export/import frequently breaks with different PyTorch and ONNX version releases. Please check their respective issue trackers before filing issues here.
|
4. ONNX / Caffe2 export/import frequently breaks with different PyTorch and ONNX version releases. Please check their respective issue trackers before filing issues here.
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -133,5 +133,3 @@ def get_act_layer(name='relu'):
|
||||||
if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting
|
if use_jit and name in _ACT_FN_JIT: # jit scripted models should be okay for export/scripting
|
||||||
return _ACT_LAYER_JIT[name]
|
return _ACT_LAYER_JIT[name]
|
||||||
return _ACT_LAYER_DEFAULT[name]
|
return _ACT_LAYER_DEFAULT[name]
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -98,5 +98,3 @@ class HardSigmoid(nn.Module):
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
return hard_sigmoid(x, self.inplace)
|
return hard_sigmoid(x, self.inplace)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -49,4 +49,3 @@ def get_outdir(path, *paths, inc=False):
|
||||||
outdir = outdir_inc
|
outdir = outdir_inc
|
||||||
os.makedirs(outdir)
|
os.makedirs(outdir)
|
||||||
return outdir
|
return outdir
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -4,15 +4,15 @@ ACADEMIC OR NON-PROFIT ORGANIZATION NONCOMMERCIAL RESEARCH USE ONLY
|
||||||
|
|
||||||
BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
|
BY USING OR DOWNLOADING THE SOFTWARE, YOU ARE AGREEING TO THE TERMS OF THIS LICENSE AGREEMENT. IF YOU DO NOT AGREE WITH THESE TERMS, YOU MAY NOT USE OR DOWNLOAD THE SOFTWARE.
|
||||||
|
|
||||||
This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
|
This is a license agreement ("Agreement") between your academic institution or non-profit organization or self (called "Licensee" or "You" in this Agreement) and Carnegie Mellon University (called "Licensor" in this Agreement). All rights not specifically granted to you in this Agreement are reserved for Licensor.
|
||||||
|
|
||||||
RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
|
RESERVATION OF OWNERSHIP AND GRANT OF LICENSE:
|
||||||
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
|
Licensor retains exclusive ownership of any copy of the Software (as defined below) licensed under this Agreement and hereby grants to Licensee a personal, non-exclusive,
|
||||||
non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
|
non-transferable license to use the Software for noncommercial research purposes, without the right to sublicense, pursuant to the terms and conditions of this Agreement. As used in this Agreement, the term "Software" means (i) the actual copy of all or any portion of code for program routines made accessible to Licensee by Licensor pursuant to this Agreement, inclusive of backups, updates, and/or merged copies permitted hereunder or subsequently supplied by Licensor, including all or any file structures, programming instructions, user interfaces and screen formats and sequences as well as any and all documentation and instructions related to it, and (ii) all or any derivatives and/or modifications created or made by You to any of the items specified in (i).
|
||||||
|
|
||||||
CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
|
CONFIDENTIALITY: Licensee acknowledges that the Software is proprietary to Licensor, and as such, Licensee agrees to receive all such materials in confidence and use the Software only in accordance with the terms of this Agreement. Licensee agrees to use reasonable effort to protect the Software from unauthorized use, reproduction, distribution, or publication.
|
||||||
|
|
||||||
COPYRIGHT: The Software is owned by Licensor and is protected by United
|
COPYRIGHT: The Software is owned by Licensor and is protected by United
|
||||||
States copyright laws and applicable international treaties and/or conventions.
|
States copyright laws and applicable international treaties and/or conventions.
|
||||||
|
|
||||||
PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
|
PERMITTED USES: The Software may be used for your own noncommercial internal research purposes. You understand and agree that Licensor is not obligated to implement any suggestions and/or feedback you might provide regarding the Software, but to the extent Licensor does so, you are not entitled to any compensation related thereto.
|
||||||
|
|
@ -35,11 +35,11 @@ FEE: Provided Licensee abides completely by the terms and conditions of this Agr
|
||||||
|
|
||||||
DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
|
DISCLAIMER OF WARRANTIES: THE SOFTWARE IS PROVIDED "AS-IS" WITHOUT WARRANTY OF ANY KIND INCLUDING ANY WARRANTIES OF PERFORMANCE OR MERCHANTABILITY OR FITNESS FOR A PARTICULAR USE OR PURPOSE OR OF NON-INFRINGEMENT. LICENSEE BEARS ALL RISK RELATING TO QUALITY AND PERFORMANCE OF THE SOFTWARE AND RELATED MATERIALS.
|
||||||
|
|
||||||
SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
|
SUPPORT AND MAINTENANCE: No Software support or training by the Licensor is provided as part of this Agreement.
|
||||||
|
|
||||||
EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
|
EXCLUSIVE REMEDY AND LIMITATION OF LIABILITY: To the maximum extent permitted under applicable law, Licensor shall not be liable for direct, indirect, special, incidental, or consequential damages or lost profits related to Licensee's use of and/or inability to use the Software, even if Licensor is advised of the possibility of such damage.
|
||||||
|
|
||||||
EXPORT REGULATION: Licensee agrees to comply with any and all applicable
|
EXPORT REGULATION: Licensee agrees to comply with any and all applicable
|
||||||
U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
|
U.S. export control laws, regulations, and/or other laws related to embargoes and sanction programs administered by the Office of Foreign Assets Control.
|
||||||
|
|
||||||
SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
|
SEVERABILITY: If any provision(s) of this Agreement shall be held to be invalid, illegal, or unenforceable by a court or other tribunal of competent jurisdiction, the validity, legality and enforceability of the remaining provisions shall not in any way be affected or impaired thereby.
|
||||||
|
|
@ -57,7 +57,7 @@ ENTIRE AGREEMENT AND AMENDMENTS: This Agreement constitutes the sole and entire
|
||||||
THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
|
THIRD-PARTY SOFTWARE NOTICES AND INFORMATION
|
||||||
|
|
||||||
This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
|
This project incorporates material from the project(s) listed below (collectively, "Third Party Code"). This Third Party Code is licensed to you under their original license terms set forth below. We reserves all other rights not expressly granted, whether by implication, estoppel or otherwise.
|
||||||
|
|
||||||
1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
|
1. Caffe, version 1.0.0, (https://github.com/BVLC/caffe/)
|
||||||
|
|
||||||
COPYRIGHT
|
COPYRIGHT
|
||||||
|
|
@ -80,13 +80,13 @@ committed.
|
||||||
LICENSE
|
LICENSE
|
||||||
|
|
||||||
Redistribution and use in source and binary forms, with or without
|
Redistribution and use in source and binary forms, with or without
|
||||||
modification, are permitted provided that the following conditions are met:
|
modification, are permitted provided that the following conditions are met:
|
||||||
|
|
||||||
1. Redistributions of source code must retain the above copyright notice, this
|
1. Redistributions of source code must retain the above copyright notice, this
|
||||||
list of conditions and the following disclaimer.
|
list of conditions and the following disclaimer.
|
||||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||||
this list of conditions and the following disclaimer in the documentation
|
this list of conditions and the following disclaimer in the documentation
|
||||||
and/or other materials provided with the distribution.
|
and/or other materials provided with the distribution.
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||||
|
|
@ -105,4 +105,4 @@ By contributing to the BVLC/caffe repository through pull-request, comment,
|
||||||
or otherwise, the contributor releases their content to the
|
or otherwise, the contributor releases their content to the
|
||||||
license and copyright terms herein.
|
license and copyright terms herein.
|
||||||
|
|
||||||
************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
|
************END OF THIRD-PARTY SOFTWARE NOTICES AND INFORMATION**********
|
||||||
|
|
|
||||||
|
|
@ -155,5 +155,3 @@ def _build_sam(
|
||||||
state_dict = torch.load(f)
|
state_dict = torch.load(f)
|
||||||
sam.load_state_dict(state_dict)
|
sam.load_state_dict(state_dict)
|
||||||
return sam
|
return sam
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,4 +18,4 @@ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||||
SOFTWARE.
|
SOFTWARE.
|
||||||
|
|
|
||||||
|
|
@ -21,4 +21,3 @@
|
||||||
# SOFTWARE.
|
# SOFTWARE.
|
||||||
|
|
||||||
# File author: Shariq Farooq Bhat
|
# File author: Shariq Farooq Bhat
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -21,4 +21,3 @@
|
||||||
# SOFTWARE.
|
# SOFTWARE.
|
||||||
|
|
||||||
# File author: Shariq Farooq Bhat
|
# File author: Shariq Farooq Bhat
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -438,4 +438,3 @@ class FeatureFusionBlock_custom(nn.Module):
|
||||||
output = self.out_conv(output)
|
output = self.out_conv(output)
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,7 @@
|
||||||
"inverse_midas": false,
|
"inverse_midas": false,
|
||||||
"img_size": [384, 512]
|
"img_size": [384, 512]
|
||||||
},
|
},
|
||||||
|
|
||||||
"train": {
|
"train": {
|
||||||
"train_midas": true,
|
"train_midas": true,
|
||||||
"use_pretrained_midas": true,
|
"use_pretrained_midas": true,
|
||||||
|
|
@ -55,4 +55,4 @@
|
||||||
"use_pretrained_midas": false,
|
"use_pretrained_midas": false,
|
||||||
"pretrained_resource" : null
|
"pretrained_resource" : null
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,7 @@
|
||||||
"bin_centers_type": "normed",
|
"bin_centers_type": "normed",
|
||||||
"img_size": [384, 768]
|
"img_size": [384, 768]
|
||||||
},
|
},
|
||||||
|
|
||||||
"train": {
|
"train": {
|
||||||
},
|
},
|
||||||
|
|
||||||
|
|
@ -19,4 +19,4 @@
|
||||||
"use_pretrained_midas": false,
|
"use_pretrained_midas": false,
|
||||||
"pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt"
|
"pretrained_resource" : "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_K.pt"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -15,7 +15,7 @@
|
||||||
"min_depth": 1e-3,
|
"min_depth": 1e-3,
|
||||||
"max_depth": 80.0
|
"max_depth": 80.0
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"bin_embedding_dim": 128,
|
"bin_embedding_dim": 128,
|
||||||
"bin_centers_type": "softplus",
|
"bin_centers_type": "softplus",
|
||||||
"n_attractors":[16, 8, 4, 1],
|
"n_attractors":[16, 8, 4, 1],
|
||||||
|
|
@ -24,8 +24,8 @@
|
||||||
"attractor_kind" : "mean",
|
"attractor_kind" : "mean",
|
||||||
"attractor_type" : "inv",
|
"attractor_type" : "inv",
|
||||||
"min_temp": 0.0212,
|
"min_temp": 0.0212,
|
||||||
"max_temp": 50.0,
|
"max_temp": 50.0,
|
||||||
"memory_efficient": true,
|
"memory_efficient": true,
|
||||||
"midas_model_type" : "DPT_BEiT_L_384",
|
"midas_model_type" : "DPT_BEiT_L_384",
|
||||||
"img_size": [384, 512]
|
"img_size": [384, 512]
|
||||||
},
|
},
|
||||||
|
|
@ -58,10 +58,10 @@
|
||||||
"use_pretrained_midas": false,
|
"use_pretrained_midas": false,
|
||||||
"force_keep_ar": true
|
"force_keep_ar": true
|
||||||
},
|
},
|
||||||
|
|
||||||
"eval": {
|
"eval": {
|
||||||
"train_midas": false,
|
"train_midas": false,
|
||||||
"pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
|
"pretrained_resource": "url::https://github.com/isl-org/ZoeDepth/releases/download/v1.0/ZoeD_M12_NK.pt",
|
||||||
"use_pretrained_midas": false
|
"use_pretrained_midas": false
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -21,4 +21,3 @@
|
||||||
# SOFTWARE.
|
# SOFTWARE.
|
||||||
|
|
||||||
# File author: Shariq Farooq Bhat
|
# File author: Shariq Farooq Bhat
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -295,7 +295,7 @@ def attn_fwd(Q, K, V, bias, Cache_seqlens, Cache_batch_idx, # pylint: disable=un
|
||||||
# The tensor allocated for L is based on MAX_SEQLENS_Q as that is
|
# The tensor allocated for L is based on MAX_SEQLENS_Q as that is
|
||||||
# statically known.
|
# statically known.
|
||||||
l_offset = LSE + off_z * stride_lse_z + off_h_q * stride_lse_h + cu_seqlens_q_start * stride_lse_m
|
l_offset = LSE + off_z * stride_lse_z + off_h_q * stride_lse_h + cu_seqlens_q_start * stride_lse_m
|
||||||
l_ptrs = l_offset + offs_m * stride_lse_m
|
l_ptrs = l_offset + offs_m * stride_lse_m
|
||||||
|
|
||||||
l = tl.full([BLOCK_M], value=0.0, dtype=ACCUMULATOR_TYPE)
|
l = tl.full([BLOCK_M], value=0.0, dtype=ACCUMULATOR_TYPE)
|
||||||
|
|
||||||
|
|
@ -450,7 +450,7 @@ def attn_fwd(Q, K, V, bias, Cache_seqlens, Cache_batch_idx, # pylint: disable=un
|
||||||
|
|
||||||
# write back LSE(Log Sum Exponents), the log of the normalization constant
|
# write back LSE(Log Sum Exponents), the log of the normalization constant
|
||||||
l_offset = LSE + off_z * stride_lse_z + off_h_q * stride_lse_h + cu_seqlens_q_start * stride_lse_m
|
l_offset = LSE + off_z * stride_lse_z + off_h_q * stride_lse_h + cu_seqlens_q_start * stride_lse_m
|
||||||
l_ptrs = l_offset + offs_m * stride_lse_m
|
l_ptrs = l_offset + offs_m * stride_lse_m
|
||||||
if USE_EXP2:
|
if USE_EXP2:
|
||||||
RCP_LN2: tl.constexpr = 1.4426950408889634
|
RCP_LN2: tl.constexpr = 1.4426950408889634
|
||||||
LN2: tl.constexpr = 0.6931471824645996
|
LN2: tl.constexpr = 0.6931471824645996
|
||||||
|
|
@ -499,9 +499,9 @@ def attention_prefill_forward_triton_impl(
|
||||||
bias: Optional[torch.Tensor],
|
bias: Optional[torch.Tensor],
|
||||||
layout: Literal["bshd", "bhsd", "thd"],
|
layout: Literal["bshd", "bhsd", "thd"],
|
||||||
# varlen
|
# varlen
|
||||||
cu_seqlens_q: Optional[torch.Tensor],
|
cu_seqlens_q: Optional[torch.Tensor],
|
||||||
cu_seqlens_k: Optional[torch.Tensor],
|
cu_seqlens_k: Optional[torch.Tensor],
|
||||||
max_seqlens_q: int,
|
max_seqlens_q: int,
|
||||||
max_seqlens_k: int,
|
max_seqlens_k: int,
|
||||||
# inference
|
# inference
|
||||||
cache_seqlens: Optional[Union[(int, torch.Tensor)]],
|
cache_seqlens: Optional[Union[(int, torch.Tensor)]],
|
||||||
|
|
@ -570,7 +570,7 @@ def attention_prefill_forward_triton_impl(
|
||||||
attn_fwd[grid](q, k, v, bias, cache_seqlens, cache_batch_idx,
|
attn_fwd[grid](q, k, v, bias, cache_seqlens, cache_batch_idx,
|
||||||
sm_scale, softmax_lse, o, *q_strides, *k_strides, *v_strides, *o_strides,
|
sm_scale, softmax_lse, o, *q_strides, *k_strides, *v_strides, *o_strides,
|
||||||
*bias_strides, stride_az, stride_ah, *scores_strides, stride_lse_z, stride_lse_h, stride_lse_m, cu_seqlens_q, cu_seqlens_k,
|
*bias_strides, stride_az, stride_ah, *scores_strides, stride_lse_z, stride_lse_h, stride_lse_m, cu_seqlens_q, cu_seqlens_k,
|
||||||
dropout_p=dropout_p, philox_seed=philox_seed, philox_offset_base=philox_offset, sd_mask=sd_mask, dropout_mask=dropout_mask, alibi_slopes=alibi_slopes,
|
dropout_p=dropout_p, philox_seed=philox_seed, philox_offset_base=philox_offset, sd_mask=sd_mask, dropout_mask=dropout_mask, alibi_slopes=alibi_slopes,
|
||||||
HQ=nheads_q, HK=nheads_k, ACTUAL_BLOCK_DMODEL=head_size, MAX_SEQLENS_Q=max_seqlens_q,
|
HQ=nheads_q, HK=nheads_k, ACTUAL_BLOCK_DMODEL=head_size, MAX_SEQLENS_Q=max_seqlens_q,
|
||||||
MAX_SEQLENS_K=max_seqlens_k, IS_CAUSAL=causal, IS_VARLEN=is_varlen, IS_INFERENCE=is_inference,
|
MAX_SEQLENS_K=max_seqlens_k, IS_CAUSAL=causal, IS_VARLEN=is_varlen, IS_INFERENCE=is_inference,
|
||||||
BLOCK_DMODEL=padded_d_model, USE_BIAS=False if bias is None else True,
|
BLOCK_DMODEL=padded_d_model, USE_BIAS=False if bias is None else True,
|
||||||
|
|
|
||||||
|
|
@ -366,7 +366,7 @@ def get_shape_from_layout(
|
||||||
elif layout == 'thd':
|
elif layout == 'thd':
|
||||||
total_seqlen, num_heads, head_dim = x.shape
|
total_seqlen, num_heads, head_dim = x.shape
|
||||||
if cu_seqlens is None:
|
if cu_seqlens is None:
|
||||||
raise ValueError("cu_seqlens must be provided for varlen (thd) layout")
|
raise ValueError("cu_seqlens must be provided for varlen (thd) layout")
|
||||||
if max_seqlen is None:
|
if max_seqlen is None:
|
||||||
raise ValueError("max_seqlen must be provided for varlen (thd) layout")
|
raise ValueError("max_seqlen must be provided for varlen (thd) layout")
|
||||||
|
|
||||||
|
|
@ -389,7 +389,7 @@ def get_shapes_from_layout(q, k, layout, cu_seqlens_q = None, cu_seqlens_k = Non
|
||||||
|
|
||||||
def get_stride_from_layout(x: torch.Tensor, layout:Literal["bshd", "bhsd", "thd"]):
|
def get_stride_from_layout(x: torch.Tensor, layout:Literal["bshd", "bhsd", "thd"]):
|
||||||
if layout == 'thd':
|
if layout == 'thd':
|
||||||
strides = (0, x.stride(1), x.stride(0), x.stride(2))
|
strides = (0, x.stride(1), x.stride(0), x.stride(2))
|
||||||
elif layout == 'bhsd':
|
elif layout == 'bhsd':
|
||||||
strides = (x.stride(0), x.stride(1), x.stride(2), x.stride(3))
|
strides = (x.stride(0), x.stride(1), x.stride(2), x.stride(3))
|
||||||
elif layout == 'bshd':
|
elif layout == 'bshd':
|
||||||
|
|
|
||||||
|
|
@ -219,7 +219,7 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
self.betas = rescale_zero_terminal_snr(self.betas)
|
self.betas = rescale_zero_terminal_snr(self.betas)
|
||||||
|
|
||||||
self.alphas = 1.0 - self.betas #may have to add something for last step
|
self.alphas = 1.0 - self.betas #may have to add something for last step
|
||||||
|
|
||||||
self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
|
self.alphas_cumprod = torch.cumprod(self.alphas, dim=0)
|
||||||
# At every step in ddim, we are looking into the previous alphas_cumprod
|
# At every step in ddim, we are looking into the previous alphas_cumprod
|
||||||
# For the final step, there is no previous alphas_cumprod because we are already at 0
|
# For the final step, there is no previous alphas_cumprod because we are already at 0
|
||||||
|
|
@ -357,7 +357,7 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
) -> Union[DDIMSchedulerOutput, Tuple]:
|
) -> Union[DDIMSchedulerOutput, Tuple]:
|
||||||
"""
|
"""
|
||||||
Predict the sample from the previous timestep by reversing the SDE.
|
Predict the sample from the previous timestep by reversing the SDE.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
model_output (torch.Tensor): Direct output from learned diffusion model
|
model_output (torch.Tensor): Direct output from learned diffusion model
|
||||||
timestep (int): Current discrete timestep in the diffusion chain
|
timestep (int): Current discrete timestep in the diffusion chain
|
||||||
|
|
@ -458,15 +458,15 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
alpha_prod_t_next = self.alphas_cumprod[next_timestep]
|
alpha_prod_t_next = self.alphas_cumprod[next_timestep]
|
||||||
alpha_i_plus_1 = alpha_prod_t_next ** 0.5
|
alpha_i_plus_1 = alpha_prod_t_next ** 0.5
|
||||||
sigma_i_plus_1 = (1 - alpha_prod_t_next) ** 0.5
|
sigma_i_plus_1 = (1 - alpha_prod_t_next) ** 0.5
|
||||||
|
|
||||||
if debug:
|
if debug:
|
||||||
print(f"alpha_i_plus_1: {alpha_i_plus_1}")
|
print(f"alpha_i_plus_1: {alpha_i_plus_1}")
|
||||||
print(f"sigma_i_plus_1: {sigma_i_plus_1}")
|
print(f"sigma_i_plus_1: {sigma_i_plus_1}")
|
||||||
|
|
||||||
a = alpha_i_plus_1 * pred_original_sample + sigma_i_plus_1 * pred_epsilon
|
a = alpha_i_plus_1 * pred_original_sample + sigma_i_plus_1 * pred_epsilon
|
||||||
bdia_step = (
|
bdia_step = (
|
||||||
self.config.gamma * self.next_sample[-2] +
|
self.config.gamma * self.next_sample[-2] +
|
||||||
ddim_step -
|
ddim_step -
|
||||||
(self.config.gamma * a)
|
(self.config.gamma * a)
|
||||||
)
|
)
|
||||||
self.update_next_sample_BDIA(bdia_step)
|
self.update_next_sample_BDIA(bdia_step)
|
||||||
|
|
@ -477,7 +477,7 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
if eta > 0:
|
if eta > 0:
|
||||||
if debug:
|
if debug:
|
||||||
print(f"\nApplying variance noise with eta: {eta}")
|
print(f"\nApplying variance noise with eta: {eta}")
|
||||||
|
|
||||||
if variance_noise is not None and generator is not None:
|
if variance_noise is not None and generator is not None:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Cannot pass both generator and variance_noise. Use either `generator` or `variance_noise`."
|
"Cannot pass both generator and variance_noise. Use either `generator` or `variance_noise`."
|
||||||
|
|
@ -496,7 +496,7 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
return (prev_sample,)
|
return (prev_sample,)
|
||||||
|
|
||||||
return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
|
return DDIMSchedulerOutput(prev_sample=prev_sample, pred_original_sample=pred_original_sample)
|
||||||
|
|
||||||
def add_noise(
|
def add_noise(
|
||||||
self,
|
self,
|
||||||
original_samples: torch.Tensor,
|
original_samples: torch.Tensor,
|
||||||
|
|
@ -542,10 +542,10 @@ class BDIA_DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||||
|
|
||||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||||
return velocity
|
return velocity
|
||||||
|
|
||||||
def update_next_sample_BDIA(self, new_value):
|
def update_next_sample_BDIA(self, new_value):
|
||||||
self.next_sample.append(new_value.clone())
|
self.next_sample.append(new_value.clone())
|
||||||
|
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
return self.config.num_train_timesteps
|
return self.config.num_train_timesteps
|
||||||
|
|
|
||||||
|
|
@ -740,7 +740,7 @@ class DCSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||||
t_ = ratio * (t_prev_list[-1] - t_prev_list[-2]) + t_prev_list[-2]
|
t_ = ratio * (t_prev_list[-1] - t_prev_list[-2]) + t_prev_list[-2]
|
||||||
|
|
||||||
inter_order = min(self.dc_order + 1, 4)
|
inter_order = min(self.dc_order + 1, 4)
|
||||||
|
|
||||||
if inter_order is not None:
|
if inter_order is not None:
|
||||||
model_t_dc = torch.zeros_like(model_prev_list[-1])
|
model_t_dc = torch.zeros_like(model_prev_list[-1])
|
||||||
for i in range(inter_order):
|
for i in range(inter_order):
|
||||||
|
|
@ -768,7 +768,7 @@ class DCSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||||
else:
|
else:
|
||||||
scalar_t = 0
|
scalar_t = 0
|
||||||
ratio_param = torch.nn.Parameter(torch.tensor([param_initial], device=sample.device), requires_grad=True)
|
ratio_param = torch.nn.Parameter(torch.tensor([param_initial], device=sample.device), requires_grad=True)
|
||||||
|
|
||||||
sample_clone = sample.clone()
|
sample_clone = sample.clone()
|
||||||
|
|
||||||
index = np.where(self.ddim_gt['ts'] >= scalar_t)[0].max()
|
index = np.where(self.ddim_gt['ts'] >= scalar_t)[0].max()
|
||||||
|
|
|
||||||
|
|
@ -114,25 +114,25 @@ class FlowMatchDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||||
The DPMSolver order which can be `2` or `3`. It is recommended to use `solver_order=2` for guided
|
The DPMSolver order which can be `2` or `3`. It is recommended to use `solver_order=2` for guided
|
||||||
sampling, and `solver_order=3` for unconditional sampling.
|
sampling, and `solver_order=3` for unconditional sampling.
|
||||||
algorithm_type (`str`, defaults to `dpmsolver++2M`):
|
algorithm_type (`str`, defaults to `dpmsolver++2M`):
|
||||||
Algorithm type for the solver; can be `dpmsolver2`, `dpmsolver2A`, `dpmsolver++2M`, `dpmsolver++2S`, `dpmsolver++sde`, `dpmsolver++2Msde`,
|
Algorithm type for the solver; can be `dpmsolver2`, `dpmsolver2A`, `dpmsolver++2M`, `dpmsolver++2S`, `dpmsolver++sde`, `dpmsolver++2Msde`,
|
||||||
or `dpmsolver++3Msde`.
|
or `dpmsolver++3Msde`.
|
||||||
solver_type (`str`, defaults to `midpoint`):
|
solver_type (`str`, defaults to `midpoint`):
|
||||||
Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
|
Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
|
||||||
sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
|
sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
|
||||||
sigma_schedule (`str`, *optional*, defaults to None (beta)): Sigma schedule to compute the `sigmas`. Optionally, we use
|
sigma_schedule (`str`, *optional*, defaults to None (beta)): Sigma schedule to compute the `sigmas`. Optionally, we use
|
||||||
the schedule "karras" introduced in the EDM paper (https://arxiv.org/abs/2206.00364). Other acceptable values are
|
the schedule "karras" introduced in the EDM paper (https://arxiv.org/abs/2206.00364). Other acceptable values are
|
||||||
"exponential". The exponential schedule was incorporated in this model: https://huggingface.co/stabilityai/cosxl.
|
"exponential". The exponential schedule was incorporated in this model: https://huggingface.co/stabilityai/cosxl.
|
||||||
Other acceptable values are "lambdas". The uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the
|
Other acceptable values are "lambdas". The uniform-logSNR for step sizes proposed by Lu's DPM-Solver in the
|
||||||
noise schedule during the sampling process. The sigmas and time steps are determined according to a sequence of `lambda(t)`.
|
noise schedule during the sampling process. The sigmas and time steps are determined according to a sequence of `lambda(t)`.
|
||||||
"betas" for step sizes in the noise schedule during the sampling process. Refer to [Beta
|
"betas" for step sizes in the noise schedule during the sampling process. Refer to [Beta
|
||||||
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
|
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
|
||||||
use_noise_sampler for BrownianTreeNoiseSampler (only valid for `dpmsolver++2S`, `dpmsolver++sde`, `dpmsolver++2Msde`, or `dpmsolver++3Msde`.
|
use_noise_sampler for BrownianTreeNoiseSampler (only valid for `dpmsolver++2S`, `dpmsolver++sde`, `dpmsolver++2Msde`, or `dpmsolver++3Msde`.
|
||||||
A noise sampler backed by a torchsde increasing the stability of convergence. Default strategy
|
A noise sampler backed by a torchsde increasing the stability of convergence. Default strategy
|
||||||
(random noise) has it jumping all over the place, but Brownian sampling is more stable. Utilizes the model generation seed provided.
|
(random noise) has it jumping all over the place, but Brownian sampling is more stable. Utilizes the model generation seed provided.
|
||||||
midpoint_ratio (`float`, *optional*, range: 0.4 to 0.6, default=0.5): Only valid for (`dpmsolver++sde`, `dpmsolver++2S`).
|
midpoint_ratio (`float`, *optional*, range: 0.4 to 0.6, default=0.5): Only valid for (`dpmsolver++sde`, `dpmsolver++2S`).
|
||||||
Higher values may result in smoothing, more vivid colors and less noise at the expense of more detail and effect.
|
Higher values may result in smoothing, more vivid colors and less noise at the expense of more detail and effect.
|
||||||
s_noise (`float`, *optional*, defaults to 1.0): Sigma noise strength: range 0 - 1.1 (only valid for `dpmsolver++2S`, `dpmsolver++sde`,
|
s_noise (`float`, *optional*, defaults to 1.0): Sigma noise strength: range 0 - 1.1 (only valid for `dpmsolver++2S`, `dpmsolver++sde`,
|
||||||
`dpmsolver++2Msde`, or `dpmsolver++3Msde`). The amount of additional noise to counteract loss of detail during sampling. A
|
`dpmsolver++2Msde`, or `dpmsolver++3Msde`). The amount of additional noise to counteract loss of detail during sampling. A
|
||||||
reasonable range is [1.000, 1.011]. Defaults to 1.0 from the original implementation.
|
reasonable range is [1.000, 1.011]. Defaults to 1.0 from the original implementation.
|
||||||
use_beta_sigmas: (`bool` defaults to False for FLUX and True for SD3). Based on original interpretation of using beta values for determining sigmas.
|
use_beta_sigmas: (`bool` defaults to False for FLUX and True for SD3). Based on original interpretation of using beta values for determining sigmas.
|
||||||
use_dynamic_shifting (`bool` defaults to False for SD3 and True for FLUX). When `True`, shift is ignored.
|
use_dynamic_shifting (`bool` defaults to False for SD3 and True for FLUX). When `True`, shift is ignored.
|
||||||
|
|
|
||||||
|
|
@ -717,7 +717,7 @@ options_templates.update(options_section(('saving-paths', "Image Paths"), {
|
||||||
"outdir_extras_samples": OptionInfo("outputs/extras", 'Folder for processed images', component_args=hide_dirs, folder=True),
|
"outdir_extras_samples": OptionInfo("outputs/extras", 'Folder for processed images', component_args=hide_dirs, folder=True),
|
||||||
"outdir_save": OptionInfo("outputs/save", "Folder for manually saved images", component_args=hide_dirs, folder=True),
|
"outdir_save": OptionInfo("outputs/save", "Folder for manually saved images", component_args=hide_dirs, folder=True),
|
||||||
"outdir_video": OptionInfo("outputs/video", "Folder for videos", component_args=hide_dirs, folder=True),
|
"outdir_video": OptionInfo("outputs/video", "Folder for videos", component_args=hide_dirs, folder=True),
|
||||||
"outdir_init_images": OptionInfo("outputs/init-images", "Folder for init images", component_args=hide_dirs, folder=True),
|
"outdir_init_images": OptionInfo("outputs/inputs", "Folder for init images", component_args=hide_dirs, folder=True),
|
||||||
|
|
||||||
"outdir_sep_grids": OptionInfo("<h2>Grids</h2>", "", gr.HTML),
|
"outdir_sep_grids": OptionInfo("<h2>Grids</h2>", "", gr.HTML),
|
||||||
"outdir_grids": OptionInfo("", "Grids folder", component_args=hide_dirs, folder=True),
|
"outdir_grids": OptionInfo("", "Grids folder", component_args=hide_dirs, folder=True),
|
||||||
|
|
|
||||||
|
|
@ -329,4 +329,4 @@ def teacache_chroma_forward(
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return (output,)
|
return (output,)
|
||||||
|
|
||||||
return Transformer2DModelOutput(sample=output)
|
return Transformer2DModelOutput(sample=output)
|
||||||
|
|
|
||||||
|
|
@ -64,10 +64,10 @@ def teacache_cog_forward(
|
||||||
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
else:
|
else:
|
||||||
if not self.config.use_rotary_positional_embeddings:
|
if not self.config.use_rotary_positional_embeddings:
|
||||||
# CogVideoX-2B
|
# CogVideoX-2B
|
||||||
coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
|
coefficients = [-3.10658903e+01, 2.54732368e+01, -5.92380459e+00, 1.75769064e+00, -3.61568434e-03]
|
||||||
else:
|
else:
|
||||||
# CogVideoX-5B and CogvideoX1.5-5B
|
# CogVideoX-5B and CogvideoX1.5-5B
|
||||||
coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
|
coefficients = [-1.53880483e+03, 8.43202495e+02, -1.34363087e+02, 7.97131516e+00, -5.23162339e-02]
|
||||||
|
|
@ -81,8 +81,8 @@ def teacache_cog_forward(
|
||||||
self.previous_modulated_input = emb
|
self.previous_modulated_input = emb
|
||||||
self.cnt += 1
|
self.cnt += 1
|
||||||
if self.cnt == self.num_steps:
|
if self.cnt == self.num_steps:
|
||||||
self.cnt = 0
|
self.cnt = 0
|
||||||
|
|
||||||
if self.enable_teacache:
|
if self.enable_teacache:
|
||||||
if not should_calc:
|
if not should_calc:
|
||||||
hidden_states += self.previous_residual
|
hidden_states += self.previous_residual
|
||||||
|
|
|
||||||
|
|
@ -119,7 +119,7 @@ def teacache_hidream_forward(
|
||||||
else:
|
else:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
self.previous_modulated_input = modulated_inp
|
self.previous_modulated_input = modulated_inp
|
||||||
self.cnt += 1
|
self.cnt += 1
|
||||||
if self.cnt == self.num_steps:
|
if self.cnt == self.num_steps:
|
||||||
self.cnt = 0
|
self.cnt = 0
|
||||||
|
|
@ -262,4 +262,4 @@ def teacache_hidream_forward(
|
||||||
|
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return (output,)
|
return (output,)
|
||||||
return Transformer2DModelOutput(sample=output)
|
return Transformer2DModelOutput(sample=output)
|
||||||
|
|
|
||||||
|
|
@ -69,7 +69,7 @@ def teacache_ltx_forward(
|
||||||
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
else:
|
else:
|
||||||
coefficients = [2.14700694e+01, -1.28016453e+01, 2.31279151e+00, 7.92487521e-01, 9.69274326e-03]
|
coefficients = [2.14700694e+01, -1.28016453e+01, 2.31279151e+00, 7.92487521e-01, 9.69274326e-03]
|
||||||
rescale_func = np.poly1d(coefficients)
|
rescale_func = np.poly1d(coefficients)
|
||||||
self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
|
self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
|
||||||
|
|
@ -78,11 +78,11 @@ def teacache_ltx_forward(
|
||||||
else:
|
else:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
self.previous_modulated_input = modulated_inp
|
self.previous_modulated_input = modulated_inp
|
||||||
self.cnt += 1
|
self.cnt += 1
|
||||||
if self.cnt == self.num_steps:
|
if self.cnt == self.num_steps:
|
||||||
self.cnt = 0
|
self.cnt = 0
|
||||||
|
|
||||||
if self.enable_teacache:
|
if self.enable_teacache:
|
||||||
if not should_calc:
|
if not should_calc:
|
||||||
hidden_states += self.previous_residual
|
hidden_states += self.previous_residual
|
||||||
|
|
|
||||||
|
|
@ -65,7 +65,7 @@ def teacache_mochi_forward(
|
||||||
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
if self.cnt == 0 or self.cnt == self.num_steps-1:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
else:
|
else:
|
||||||
coefficients = [-3.51241319e+03, 8.11675948e+02, -6.09400215e+01, 2.42429681e+00, 3.05291719e-03]
|
coefficients = [-3.51241319e+03, 8.11675948e+02, -6.09400215e+01, 2.42429681e+00, 3.05291719e-03]
|
||||||
rescale_func = np.poly1d(coefficients)
|
rescale_func = np.poly1d(coefficients)
|
||||||
self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
|
self.accumulated_rel_l1_distance += rescale_func(((modulated_inp-self.previous_modulated_input).abs().mean() / self.previous_modulated_input.abs().mean()).cpu().item())
|
||||||
|
|
@ -74,11 +74,11 @@ def teacache_mochi_forward(
|
||||||
else:
|
else:
|
||||||
should_calc = True
|
should_calc = True
|
||||||
self.accumulated_rel_l1_distance = 0
|
self.accumulated_rel_l1_distance = 0
|
||||||
self.previous_modulated_input = modulated_inp
|
self.previous_modulated_input = modulated_inp
|
||||||
self.cnt += 1
|
self.cnt += 1
|
||||||
if self.cnt == self.num_steps:
|
if self.cnt == self.num_steps:
|
||||||
self.cnt = 0
|
self.cnt = 0
|
||||||
|
|
||||||
if self.enable_teacache:
|
if self.enable_teacache:
|
||||||
if not should_calc:
|
if not should_calc:
|
||||||
hidden_states += self.previous_residual
|
hidden_states += self.previous_residual
|
||||||
|
|
@ -112,7 +112,7 @@ def teacache_mochi_forward(
|
||||||
image_rotary_emb=image_rotary_emb,
|
image_rotary_emb=image_rotary_emb,
|
||||||
)
|
)
|
||||||
hidden_states = self.norm_out(hidden_states, temb)
|
hidden_states = self.norm_out(hidden_states, temb)
|
||||||
self.previous_residual = hidden_states - ori_hidden_states
|
self.previous_residual = hidden_states - ori_hidden_states
|
||||||
else:
|
else:
|
||||||
for i, block in enumerate(self.transformer_blocks):
|
for i, block in enumerate(self.transformer_blocks):
|
||||||
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
if torch.is_grad_enabled() and self.gradient_checkpointing:
|
||||||
|
|
|
||||||
|
|
@ -22,7 +22,8 @@
|
||||||
"eslint": "eslint javascript/ extensions-builtin/sdnext-modernui/javascript/",
|
"eslint": "eslint javascript/ extensions-builtin/sdnext-modernui/javascript/",
|
||||||
"ruff": ". venv/bin/activate && ruff check",
|
"ruff": ". venv/bin/activate && ruff check",
|
||||||
"pylint": ". venv/bin/activate && pylint *.py modules/ pipelines/ scripts/ extensions-builtin/ | grep -v '^*'",
|
"pylint": ". venv/bin/activate && pylint *.py modules/ pipelines/ scripts/ extensions-builtin/ | grep -v '^*'",
|
||||||
"lint": "npm run eslint && npm run ruff && npm run pylint | grep -v TODO",
|
"format": ". venv/bin/activate && pre-commit run -a",
|
||||||
|
"lint": "npm run eslint && npm run format && npm run ruff && npm run pylint | grep -v TODO",
|
||||||
"todo": "npm run pylint | grep W0511 | awk -F'TODO ' '{print \"- \"$NF}' | sed 's/ (fixme)//g' | sort",
|
"todo": "npm run pylint | grep W0511 | awk -F'TODO ' '{print \"- \"$NF}' | sed 's/ (fixme)//g' | sort",
|
||||||
"test": ". venv/bin/activate; python launch.py --debug --test"
|
"test": ". venv/bin/activate; python launch.py --debug --test"
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -54,7 +54,7 @@ class Pipeline(DiffusionPipeline):
|
||||||
vqvae: VQModel
|
vqvae: VQModel
|
||||||
tokenizer: CLIPTokenizer
|
tokenizer: CLIPTokenizer
|
||||||
text_encoder: CLIPTextModelWithProjection
|
text_encoder: CLIPTextModelWithProjection
|
||||||
transformer: Transformer2DModel
|
transformer: Transformer2DModel
|
||||||
scheduler: Scheduler
|
scheduler: Scheduler
|
||||||
# tokenizer_t5: T5Tokenizer
|
# tokenizer_t5: T5Tokenizer
|
||||||
# text_encoder_t5: T5ForConditionalGeneration
|
# text_encoder_t5: T5ForConditionalGeneration
|
||||||
|
|
@ -66,7 +66,7 @@ class Pipeline(DiffusionPipeline):
|
||||||
vqvae: VQModel,
|
vqvae: VQModel,
|
||||||
tokenizer: CLIPTokenizer,
|
tokenizer: CLIPTokenizer,
|
||||||
text_encoder: CLIPTextModelWithProjection,
|
text_encoder: CLIPTextModelWithProjection,
|
||||||
transformer: Transformer2DModel,
|
transformer: Transformer2DModel,
|
||||||
scheduler: Scheduler,
|
scheduler: Scheduler,
|
||||||
# tokenizer_t5: T5Tokenizer,
|
# tokenizer_t5: T5Tokenizer,
|
||||||
# text_encoder_t5: T5ForConditionalGeneration,
|
# text_encoder_t5: T5ForConditionalGeneration,
|
||||||
|
|
@ -226,8 +226,8 @@ class Pipeline(DiffusionPipeline):
|
||||||
# truncation=True,
|
# truncation=True,
|
||||||
# max_length=512,
|
# max_length=512,
|
||||||
# ).input_ids.to(self._execution_device)
|
# ).input_ids.to(self._execution_device)
|
||||||
|
|
||||||
|
|
||||||
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
|
outputs = self.text_encoder(input_ids, return_dict=True, output_hidden_states=True)
|
||||||
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
|
# outputs_t5 = self.text_encoder_t5(input_ids_t5, decoder_input_ids = input_ids_t5 ,return_dict=True, output_hidden_states=True)
|
||||||
prompt_embeds = outputs.text_embeds
|
prompt_embeds = outputs.text_embeds
|
||||||
|
|
@ -265,8 +265,8 @@ class Pipeline(DiffusionPipeline):
|
||||||
negative_prompt_embeds = outputs.text_embeds
|
negative_prompt_embeds = outputs.text_embeds
|
||||||
negative_encoder_hidden_states = outputs.hidden_states[-2]
|
negative_encoder_hidden_states = outputs.hidden_states[-2]
|
||||||
# negative_encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
|
# negative_encoder_hidden_states = outputs_t5.encoder_hidden_states[-2]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
|
negative_prompt_embeds = negative_prompt_embeds.repeat(num_images_per_prompt, 1)
|
||||||
negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
|
negative_encoder_hidden_states = negative_encoder_hidden_states.repeat(num_images_per_prompt, 1, 1)
|
||||||
|
|
@ -370,4 +370,4 @@ class Pipeline(DiffusionPipeline):
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return (output,)
|
return (output,)
|
||||||
|
|
||||||
return ImagePipelineOutput(output)
|
return ImagePipelineOutput(output)
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ pipe = pipe.to(device)
|
||||||
|
|
||||||
steps = 64
|
steps = 64
|
||||||
guidance_scale = 9
|
guidance_scale = 9
|
||||||
resolution = 1024
|
resolution = 1024
|
||||||
negative = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark"
|
negative = "worst quality, low quality, low res, blurry, distortion, watermark, logo, signature, text, jpeg artifacts, signature, sketch, duplicate, ugly, identifying mark"
|
||||||
prompt = "Beautiful young woman posing on a lake with snow covered mountains in the background"
|
prompt = "Beautiful young woman posing on a lake with snow covered mountains in the background"
|
||||||
image = pipe(prompt=prompt, negative_prompt=negative, height=resolution, width=resolution, guidance_scale=guidance_scale, num_inference_steps=steps).images[0]
|
image = pipe(prompt=prompt, negative_prompt=negative, height=resolution, width=resolution, guidance_scale=guidance_scale, num_inference_steps=steps).images[0]
|
||||||
|
|
|
||||||
|
|
@ -34,7 +34,7 @@ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZ
|
||||||
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
|
||||||
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
from diffusers.utils.torch_utils import maybe_allow_in_graph
|
||||||
from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings,TimestepEmbedding, get_timestep_embedding #,FluxPosEmbed
|
from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings,TimestepEmbedding, get_timestep_embedding #,FluxPosEmbed
|
||||||
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
from diffusers.models.modeling_outputs import Transformer2DModelOutput
|
||||||
from diffusers.models.resnet import Downsample2D, Upsample2D
|
from diffusers.models.resnet import Downsample2D, Upsample2D
|
||||||
|
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
@ -794,8 +794,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
|
guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_supports_gradient_checkpointing = False #True
|
_supports_gradient_checkpointing = False #True
|
||||||
# Due to NotImplementedError: DDPOptimizer backend: Found a higher order op in the graph. This is not supported. Please turn off DDP optimizer using torch._dynamo.config.optimize_ddp=False. Note that this can cause performance degradation because there will be one bucket for the entire Dynamo graph.
|
# Due to NotImplementedError: DDPOptimizer backend: Found a higher order op in the graph. This is not supported. Please turn off DDP optimizer using torch._dynamo.config.optimize_ddp=False. Note that this can cause performance degradation because there will be one bucket for the entire Dynamo graph.
|
||||||
# Please refer to this issue - https://github.com/pytorch/pytorch/issues/104674.
|
# Please refer to this issue - https://github.com/pytorch/pytorch/issues/104674.
|
||||||
_no_split_modules = ["TransformerBlock", "SingleTransformerBlock"]
|
_no_split_modules = ["TransformerBlock", "SingleTransformerBlock"]
|
||||||
|
|
||||||
|
|
@ -819,7 +819,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
):
|
):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.out_channels = in_channels
|
self.out_channels = in_channels
|
||||||
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
self.inner_dim = self.config.num_attention_heads * self.config.attention_head_dim
|
||||||
|
|
||||||
self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
|
self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
|
||||||
text_time_guidance_cls = (
|
text_time_guidance_cls = (
|
||||||
|
|
@ -830,7 +830,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
)
|
)
|
||||||
|
|
||||||
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
self.context_embedder = nn.Linear(self.config.joint_attention_dim, self.inner_dim)
|
||||||
|
|
||||||
self.transformer_blocks = nn.ModuleList(
|
self.transformer_blocks = nn.ModuleList(
|
||||||
[
|
[
|
||||||
TransformerBlock(
|
TransformerBlock(
|
||||||
|
|
@ -856,7 +856,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
|
|
||||||
self.gradient_checkpointing = False
|
self.gradient_checkpointing = False
|
||||||
|
|
||||||
in_channels_embed = self.inner_dim
|
in_channels_embed = self.inner_dim
|
||||||
ln_elementwise_affine = True
|
ln_elementwise_affine = True
|
||||||
layer_norm_eps = 1e-06
|
layer_norm_eps = 1e-06
|
||||||
use_bias = False
|
use_bias = False
|
||||||
|
|
@ -867,7 +867,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
self.mlm_layer = ConvMlmLayer(
|
self.mlm_layer = ConvMlmLayer(
|
||||||
self.inner_dim, in_channels_embed, use_bias, ln_elementwise_affine, layer_norm_eps, self.config.codebook_size
|
self.inner_dim, in_channels_embed, use_bias, ln_elementwise_affine, layer_norm_eps, self.config.codebook_size
|
||||||
)
|
)
|
||||||
self.cond_embed = TimestepEmbedding(
|
self.cond_embed = TimestepEmbedding(
|
||||||
micro_cond_embed_dim + self.config.pooled_projection_dim, self.inner_dim, sample_proj_bias=use_bias
|
micro_cond_embed_dim + self.config.pooled_projection_dim, self.inner_dim, sample_proj_bias=use_bias
|
||||||
)
|
)
|
||||||
self.encoder_proj_layer_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
|
self.encoder_proj_layer_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
|
||||||
|
|
@ -875,9 +875,9 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
self.project_to_hidden = nn.Linear(in_channels_embed, self.inner_dim, bias=use_bias)
|
self.project_to_hidden = nn.Linear(in_channels_embed, self.inner_dim, bias=use_bias)
|
||||||
self.project_from_hidden_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
|
self.project_from_hidden_norm = RMSNorm(self.inner_dim, layer_norm_eps, ln_elementwise_affine)
|
||||||
self.project_from_hidden = nn.Linear(self.inner_dim, in_channels_embed, bias=use_bias)
|
self.project_from_hidden = nn.Linear(self.inner_dim, in_channels_embed, bias=use_bias)
|
||||||
|
|
||||||
self.down_block = Simple_UVitBlock(
|
self.down_block = Simple_UVitBlock(
|
||||||
self.inner_dim,
|
self.inner_dim,
|
||||||
ln_elementwise_affine,
|
ln_elementwise_affine,
|
||||||
layer_norm_eps,
|
layer_norm_eps,
|
||||||
use_bias,
|
use_bias,
|
||||||
|
|
@ -892,7 +892,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
False,
|
False,
|
||||||
upsample=upsample,
|
upsample=upsample,
|
||||||
)
|
)
|
||||||
|
|
||||||
# self.fuse_qkv_projections()
|
# self.fuse_qkv_projections()
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|
@ -1043,26 +1043,26 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
micro_cond_encode_dim = 256 # same as self.config.micro_cond_encode_dim = 256 from amused
|
micro_cond_encode_dim = 256 # same as self.config.micro_cond_encode_dim = 256 from amused
|
||||||
micro_cond_embeds = get_timestep_embedding(
|
micro_cond_embeds = get_timestep_embedding(
|
||||||
micro_conds.flatten(), micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0
|
micro_conds.flatten(), micro_cond_encode_dim, flip_sin_to_cos=True, downscale_freq_shift=0
|
||||||
)
|
)
|
||||||
micro_cond_embeds = micro_cond_embeds.reshape((hidden_states.shape[0], -1))
|
micro_cond_embeds = micro_cond_embeds.reshape((hidden_states.shape[0], -1))
|
||||||
|
|
||||||
pooled_projections = torch.cat([pooled_projections, micro_cond_embeds], dim=1)
|
pooled_projections = torch.cat([pooled_projections, micro_cond_embeds], dim=1)
|
||||||
pooled_projections = pooled_projections.to(dtype=self.dtype)
|
pooled_projections = pooled_projections.to(dtype=self.dtype)
|
||||||
pooled_projections = self.cond_embed(pooled_projections).to(encoder_hidden_states.dtype)
|
pooled_projections = self.cond_embed(pooled_projections).to(encoder_hidden_states.dtype)
|
||||||
|
|
||||||
|
|
||||||
hidden_states = self.embed(hidden_states)
|
|
||||||
|
|
||||||
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
hidden_states = self.embed(hidden_states)
|
||||||
|
|
||||||
|
encoder_hidden_states = self.context_embedder(encoder_hidden_states)
|
||||||
encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states)
|
encoder_hidden_states = self.encoder_proj_layer_norm(encoder_hidden_states)
|
||||||
hidden_states = self.down_block(hidden_states)
|
hidden_states = self.down_block(hidden_states)
|
||||||
|
|
||||||
batch_size, channels, height, width = hidden_states.shape
|
batch_size, channels, height, width = hidden_states.shape
|
||||||
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
|
hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels)
|
||||||
hidden_states = self.project_to_hidden_norm(hidden_states)
|
hidden_states = self.project_to_hidden_norm(hidden_states)
|
||||||
hidden_states = self.project_to_hidden(hidden_states)
|
hidden_states = self.project_to_hidden(hidden_states)
|
||||||
|
|
||||||
|
|
||||||
if joint_attention_kwargs is not None:
|
if joint_attention_kwargs is not None:
|
||||||
joint_attention_kwargs = joint_attention_kwargs.copy()
|
joint_attention_kwargs = joint_attention_kwargs.copy()
|
||||||
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
lora_scale = joint_attention_kwargs.pop("scale", 1.0)
|
||||||
|
|
@ -1083,11 +1083,11 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
guidance = guidance.to(hidden_states.dtype) * 1000
|
guidance = guidance.to(hidden_states.dtype) * 1000
|
||||||
else:
|
else:
|
||||||
guidance = None
|
guidance = None
|
||||||
temb = (
|
temb = (
|
||||||
self.time_text_embed(timestep, pooled_projections)
|
self.time_text_embed(timestep, pooled_projections)
|
||||||
if guidance is None
|
if guidance is None
|
||||||
else self.time_text_embed(timestep, guidance, pooled_projections)
|
else self.time_text_embed(timestep, guidance, pooled_projections)
|
||||||
)
|
)
|
||||||
|
|
||||||
if txt_ids.ndim == 3:
|
if txt_ids.ndim == 3:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
|
|
@ -1102,8 +1102,8 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
)
|
)
|
||||||
img_ids = img_ids[0]
|
img_ids = img_ids[0]
|
||||||
ids = torch.cat((txt_ids, img_ids), dim=0)
|
ids = torch.cat((txt_ids, img_ids), dim=0)
|
||||||
|
|
||||||
image_rotary_emb = self.pos_embed(ids)
|
image_rotary_emb = self.pos_embed(ids)
|
||||||
|
|
||||||
for index_block, block in enumerate(self.transformer_blocks):
|
for index_block, block in enumerate(self.transformer_blocks):
|
||||||
if self.training and self.gradient_checkpointing:
|
if self.training and self.gradient_checkpointing:
|
||||||
|
|
@ -1131,10 +1131,10 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
encoder_hidden_states, hidden_states = block(
|
encoder_hidden_states, hidden_states = block(
|
||||||
hidden_states=hidden_states,
|
hidden_states=hidden_states,
|
||||||
encoder_hidden_states=encoder_hidden_states,
|
encoder_hidden_states=encoder_hidden_states,
|
||||||
temb=temb,
|
temb=temb,
|
||||||
image_rotary_emb=image_rotary_emb,
|
image_rotary_emb=image_rotary_emb,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
# controlnet residual
|
# controlnet residual
|
||||||
if controlnet_block_samples is not None:
|
if controlnet_block_samples is not None:
|
||||||
|
|
@ -1181,12 +1181,12 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
+ controlnet_single_block_samples[index_block // interval_control]
|
+ controlnet_single_block_samples[index_block // interval_control]
|
||||||
)
|
)
|
||||||
|
|
||||||
hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
|
hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
hidden_states = self.project_from_hidden_norm(hidden_states)
|
hidden_states = self.project_from_hidden_norm(hidden_states)
|
||||||
hidden_states = self.project_from_hidden(hidden_states)
|
hidden_states = self.project_from_hidden(hidden_states)
|
||||||
|
|
||||||
|
|
||||||
hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
|
hidden_states = hidden_states.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
|
||||||
|
|
||||||
|
|
@ -1195,11 +1195,11 @@ class Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
|
||||||
if USE_PEFT_BACKEND:
|
if USE_PEFT_BACKEND:
|
||||||
# remove `lora_scale` from each PEFT layer
|
# remove `lora_scale` from each PEFT layer
|
||||||
unscale_lora_layers(self, lora_scale)
|
unscale_lora_layers(self, lora_scale)
|
||||||
|
|
||||||
output = self.mlm_layer(hidden_states)
|
output = self.mlm_layer(hidden_states)
|
||||||
# self.unfuse_qkv_projections()
|
# self.unfuse_qkv_projections()
|
||||||
if not return_dict:
|
if not return_dict:
|
||||||
return (output,)
|
return (output,)
|
||||||
|
|
||||||
|
|
||||||
return output
|
return output
|
||||||
|
|
|
||||||
|
|
@ -54,8 +54,8 @@ class FMPipelineOutput(BaseOutput):
|
||||||
Output class for OmniGen2 pipeline.
|
Output class for OmniGen2 pipeline.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
images (Union[List[PIL.Image.Image], np.ndarray]):
|
images (Union[List[PIL.Image.Image], np.ndarray]):
|
||||||
List of denoised PIL images of length `batch_size` or numpy array of shape
|
List of denoised PIL images of length `batch_size` or numpy array of shape
|
||||||
`(batch_size, height, width, num_channels)`. Contains the generated images.
|
`(batch_size, height, width, num_channels)`. Contains the generated images.
|
||||||
"""
|
"""
|
||||||
images: Union[List[PIL.Image.Image], np.ndarray]
|
images: Union[List[PIL.Image.Image], np.ndarray]
|
||||||
|
|
|
||||||
|
|
@ -2,34 +2,34 @@ S-Lab License 1.0
|
||||||
|
|
||||||
Copyright 2022 S-Lab
|
Copyright 2022 S-Lab
|
||||||
|
|
||||||
Redistribution and use for non-commercial purpose in source and
|
Redistribution and use for non-commercial purpose in source and
|
||||||
binary forms, with or without modification, are permitted provided
|
binary forms, with or without modification, are permitted provided
|
||||||
that the following conditions are met:
|
that the following conditions are met:
|
||||||
|
|
||||||
1. Redistributions of source code must retain the above copyright
|
1. Redistributions of source code must retain the above copyright
|
||||||
notice, this list of conditions and the following disclaimer.
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
2. Redistributions in binary form must reproduce the above copyright
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
notice, this list of conditions and the following disclaimer in
|
notice, this list of conditions and the following disclaimer in
|
||||||
the documentation and/or other materials provided with the
|
the documentation and/or other materials provided with the
|
||||||
distribution.
|
distribution.
|
||||||
|
|
||||||
3. Neither the name of the copyright holder nor the names of its
|
3. Neither the name of the copyright holder nor the names of its
|
||||||
contributors may be used to endorse or promote products derived
|
contributors may be used to endorse or promote products derived
|
||||||
from this software without specific prior written permission.
|
from this software without specific prior written permission.
|
||||||
|
|
||||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||||
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||||||
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
In the event that redistribution and/or use for commercial purpose in
|
In the event that redistribution and/or use for commercial purpose in
|
||||||
source or binary forms, with or without modification is required,
|
source or binary forms, with or without modification is required,
|
||||||
please contact the contributor(s) of the work.
|
please contact the contributor(s) of the work.
|
||||||
|
|
|
||||||
|
|
@ -242,4 +242,4 @@ class ResNetArcFace(nn.Module):
|
||||||
x = self.fc5(x)
|
x = self.fc5(x)
|
||||||
x = self.bn5(x)
|
x = self.bn5(x)
|
||||||
|
|
||||||
return x
|
return x
|
||||||
|
|
|
||||||
|
|
@ -315,4 +315,4 @@ to_1tuple = _ntuple(1)
|
||||||
to_2tuple = _ntuple(2)
|
to_2tuple = _ntuple(2)
|
||||||
to_3tuple = _ntuple(3)
|
to_3tuple = _ntuple(3)
|
||||||
to_4tuple = _ntuple(4)
|
to_4tuple = _ntuple(4)
|
||||||
to_ntuple = _ntuple
|
to_ntuple = _ntuple
|
||||||
|
|
|
||||||
|
|
@ -119,7 +119,7 @@ class TransformerSALayer(nn.Module):
|
||||||
tgt_mask: Optional[Tensor] = None,
|
tgt_mask: Optional[Tensor] = None,
|
||||||
tgt_key_padding_mask: Optional[Tensor] = None,
|
tgt_key_padding_mask: Optional[Tensor] = None,
|
||||||
query_pos: Optional[Tensor] = None):
|
query_pos: Optional[Tensor] = None):
|
||||||
|
|
||||||
# self attention
|
# self attention
|
||||||
tgt2 = self.norm1(tgt)
|
tgt2 = self.norm1(tgt)
|
||||||
q = k = self.with_pos_embed(tgt2, query_pos)
|
q = k = self.with_pos_embed(tgt2, query_pos)
|
||||||
|
|
@ -159,7 +159,7 @@ class Fuse_sft_block(nn.Module):
|
||||||
|
|
||||||
@ARCH_REGISTRY.register()
|
@ARCH_REGISTRY.register()
|
||||||
class CodeFormer(VQAutoEncoder):
|
class CodeFormer(VQAutoEncoder):
|
||||||
def __init__(self, dim_embd=512, n_head=8, n_layers=9,
|
def __init__(self, dim_embd=512, n_head=8, n_layers=9,
|
||||||
codebook_size=1024, latent_size=256,
|
codebook_size=1024, latent_size=256,
|
||||||
connect_list=['32', '64', '128', '256'],
|
connect_list=['32', '64', '128', '256'],
|
||||||
fix_modules=['quantize','generator']):
|
fix_modules=['quantize','generator']):
|
||||||
|
|
@ -179,14 +179,14 @@ class CodeFormer(VQAutoEncoder):
|
||||||
self.feat_emb = nn.Linear(256, self.dim_embd)
|
self.feat_emb = nn.Linear(256, self.dim_embd)
|
||||||
|
|
||||||
# transformer
|
# transformer
|
||||||
self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
|
self.ft_layers = nn.Sequential(*[TransformerSALayer(embed_dim=dim_embd, nhead=n_head, dim_mlp=self.dim_mlp, dropout=0.0)
|
||||||
for _ in range(self.n_layers)])
|
for _ in range(self.n_layers)])
|
||||||
|
|
||||||
# logits_predict head
|
# logits_predict head
|
||||||
self.idx_pred_layer = nn.Sequential(
|
self.idx_pred_layer = nn.Sequential(
|
||||||
nn.LayerNorm(dim_embd),
|
nn.LayerNorm(dim_embd),
|
||||||
nn.Linear(dim_embd, codebook_size, bias=False))
|
nn.Linear(dim_embd, codebook_size, bias=False))
|
||||||
|
|
||||||
self.channels = {
|
self.channels = {
|
||||||
'16': 512,
|
'16': 512,
|
||||||
'32': 256,
|
'32': 256,
|
||||||
|
|
@ -221,7 +221,7 @@ class CodeFormer(VQAutoEncoder):
|
||||||
enc_feat_dict = {}
|
enc_feat_dict = {}
|
||||||
out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
|
out_list = [self.fuse_encoder_block[f_size] for f_size in self.connect_list]
|
||||||
for i, block in enumerate(self.encoder.blocks):
|
for i, block in enumerate(self.encoder.blocks):
|
||||||
x = block(x)
|
x = block(x)
|
||||||
if i in out_list:
|
if i in out_list:
|
||||||
enc_feat_dict[str(x.shape[-1])] = x.clone()
|
enc_feat_dict[str(x.shape[-1])] = x.clone()
|
||||||
|
|
||||||
|
|
@ -266,11 +266,11 @@ class CodeFormer(VQAutoEncoder):
|
||||||
fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
|
fuse_list = [self.fuse_generator_block[f_size] for f_size in self.connect_list]
|
||||||
|
|
||||||
for i, block in enumerate(self.generator.blocks):
|
for i, block in enumerate(self.generator.blocks):
|
||||||
x = block(x)
|
x = block(x)
|
||||||
if i in fuse_list: # fuse after i-th block
|
if i in fuse_list: # fuse after i-th block
|
||||||
f_size = str(x.shape[-1])
|
f_size = str(x.shape[-1])
|
||||||
if w>0:
|
if w>0:
|
||||||
x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
|
x = self.fuse_convs_dict[f_size](enc_feat_dict[f_size].detach(), x, w)
|
||||||
out = x
|
out = x
|
||||||
# logits doesn't need softmax before cross_entropy loss
|
# logits doesn't need softmax before cross_entropy loss
|
||||||
return out, logits, lq_feat
|
return out, logits, lq_feat
|
||||||
|
|
|
||||||
|
|
@ -116,4 +116,4 @@ class RRDBNet(nn.Module):
|
||||||
feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
|
feat = self.lrelu(self.conv_up1(F.interpolate(feat, scale_factor=2, mode='nearest')))
|
||||||
feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
|
feat = self.lrelu(self.conv_up2(F.interpolate(feat, scale_factor=2, mode='nearest')))
|
||||||
out = self.conv_last(self.lrelu(self.conv_hr(feat)))
|
out = self.conv_last(self.lrelu(self.conv_hr(feat)))
|
||||||
return out
|
return out
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ from basicsr.utils.registry import ARCH_REGISTRY
|
||||||
|
|
||||||
def normalize(in_channels):
|
def normalize(in_channels):
|
||||||
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
return torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
|
||||||
|
|
||||||
|
|
||||||
@torch.jit.script
|
@torch.jit.script
|
||||||
def swish(x):
|
def swish(x):
|
||||||
|
|
@ -209,15 +209,15 @@ class AttnBlock(nn.Module):
|
||||||
# compute attention
|
# compute attention
|
||||||
b, c, h, w = q.shape
|
b, c, h, w = q.shape
|
||||||
q = q.reshape(b, c, h*w)
|
q = q.reshape(b, c, h*w)
|
||||||
q = q.permute(0, 2, 1)
|
q = q.permute(0, 2, 1)
|
||||||
k = k.reshape(b, c, h*w)
|
k = k.reshape(b, c, h*w)
|
||||||
w_ = torch.bmm(q, k)
|
w_ = torch.bmm(q, k)
|
||||||
w_ = w_ * (int(c)**(-0.5))
|
w_ = w_ * (int(c)**(-0.5))
|
||||||
w_ = F.softmax(w_, dim=2)
|
w_ = F.softmax(w_, dim=2)
|
||||||
|
|
||||||
# attend to values
|
# attend to values
|
||||||
v = v.reshape(b, c, h*w)
|
v = v.reshape(b, c, h*w)
|
||||||
w_ = w_.permute(0, 2, 1)
|
w_ = w_.permute(0, 2, 1)
|
||||||
h_ = torch.bmm(v, w_)
|
h_ = torch.bmm(v, w_)
|
||||||
h_ = h_.reshape(b, c, h, w)
|
h_ = h_.reshape(b, c, h, w)
|
||||||
|
|
||||||
|
|
@ -269,18 +269,18 @@ class Encoder(nn.Module):
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
for block in self.blocks:
|
for block in self.blocks:
|
||||||
x = block(x)
|
x = block(x)
|
||||||
|
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
class Generator(nn.Module):
|
class Generator(nn.Module):
|
||||||
def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
|
def __init__(self, nf, emb_dim, ch_mult, res_blocks, img_size, attn_resolutions):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
self.nf = nf
|
self.nf = nf
|
||||||
self.ch_mult = ch_mult
|
self.ch_mult = ch_mult
|
||||||
self.num_resolutions = len(self.ch_mult)
|
self.num_resolutions = len(self.ch_mult)
|
||||||
self.num_res_blocks = res_blocks
|
self.num_res_blocks = res_blocks
|
||||||
self.resolution = img_size
|
self.resolution = img_size
|
||||||
self.attn_resolutions = attn_resolutions
|
self.attn_resolutions = attn_resolutions
|
||||||
self.in_channels = emb_dim
|
self.in_channels = emb_dim
|
||||||
self.out_channels = 3
|
self.out_channels = 3
|
||||||
|
|
@ -314,24 +314,24 @@ class Generator(nn.Module):
|
||||||
blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
|
blocks.append(nn.Conv2d(block_in_ch, self.out_channels, kernel_size=3, stride=1, padding=1))
|
||||||
|
|
||||||
self.blocks = nn.ModuleList(blocks)
|
self.blocks = nn.ModuleList(blocks)
|
||||||
|
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
for block in self.blocks:
|
for block in self.blocks:
|
||||||
x = block(x)
|
x = block(x)
|
||||||
|
|
||||||
return x
|
return x
|
||||||
|
|
||||||
|
|
||||||
@ARCH_REGISTRY.register()
|
@ARCH_REGISTRY.register()
|
||||||
class VQAutoEncoder(nn.Module):
|
class VQAutoEncoder(nn.Module):
|
||||||
def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=[16], codebook_size=1024, emb_dim=256,
|
def __init__(self, img_size, nf, ch_mult, quantizer="nearest", res_blocks=2, attn_resolutions=[16], codebook_size=1024, emb_dim=256,
|
||||||
beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
|
beta=0.25, gumbel_straight_through=False, gumbel_kl_weight=1e-8, model_path=None):
|
||||||
super().__init__()
|
super().__init__()
|
||||||
logger = get_root_logger()
|
logger = get_root_logger()
|
||||||
self.in_channels = 3
|
self.in_channels = 3
|
||||||
self.nf = nf
|
self.nf = nf
|
||||||
self.n_blocks = res_blocks
|
self.n_blocks = res_blocks
|
||||||
self.codebook_size = codebook_size
|
self.codebook_size = codebook_size
|
||||||
self.embed_dim = emb_dim
|
self.embed_dim = emb_dim
|
||||||
self.ch_mult = ch_mult
|
self.ch_mult = ch_mult
|
||||||
|
|
@ -362,11 +362,11 @@ class VQAutoEncoder(nn.Module):
|
||||||
self.kl_weight
|
self.kl_weight
|
||||||
)
|
)
|
||||||
self.generator = Generator(
|
self.generator = Generator(
|
||||||
self.nf,
|
self.nf,
|
||||||
self.embed_dim,
|
self.embed_dim,
|
||||||
self.ch_mult,
|
self.ch_mult,
|
||||||
self.n_blocks,
|
self.n_blocks,
|
||||||
self.resolution,
|
self.resolution,
|
||||||
self.attn_resolutions
|
self.attn_resolutions
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -431,4 +431,4 @@ class VQGANDiscriminator(nn.Module):
|
||||||
raise ValueError(f'Wrong params!')
|
raise ValueError(f'Wrong params!')
|
||||||
|
|
||||||
def forward(self, x):
|
def forward(self, x):
|
||||||
return self.main(x)
|
return self.main(x)
|
||||||
|
|
|
||||||
|
|
@ -254,8 +254,8 @@ class PerceptualLoss(nn.Module):
|
||||||
|
|
||||||
@LOSS_REGISTRY.register()
|
@LOSS_REGISTRY.register()
|
||||||
class LPIPSLoss(nn.Module):
|
class LPIPSLoss(nn.Module):
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
loss_weight=1.0,
|
loss_weight=1.0,
|
||||||
use_input_norm=True,
|
use_input_norm=True,
|
||||||
range_norm=False,):
|
range_norm=False,):
|
||||||
super(LPIPSLoss, self).__init__()
|
super(LPIPSLoss, self).__init__()
|
||||||
|
|
|
||||||
|
|
@ -130,7 +130,7 @@ def train_pipeline(root_path):
|
||||||
|
|
||||||
# initialize loggers
|
# initialize loggers
|
||||||
logger, tb_logger = init_loggers(opt)
|
logger, tb_logger = init_loggers(opt)
|
||||||
|
|
||||||
# create train and validation dataloaders
|
# create train and validation dataloaders
|
||||||
result = create_train_val_dataloader(opt, logger)
|
result = create_train_val_dataloader(opt, logger)
|
||||||
train_loader, train_sampler, val_loader, total_epochs, total_iters = result
|
train_loader, train_sampler, val_loader, total_epochs, total_iters = result
|
||||||
|
|
|
||||||
|
|
@ -92,4 +92,4 @@ def load_file_from_url(url, model_dir=None, progress=True, file_name=None):
|
||||||
if not os.path.exists(cached_file):
|
if not os.path.exists(cached_file):
|
||||||
print(f'Downloading: "{url}" to {cached_file}\n')
|
print(f'Downloading: "{url}" to {cached_file}\n')
|
||||||
download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
|
download_url_to_file(url, cached_file, hash_prefix=None, progress=progress)
|
||||||
return cached_file
|
return cached_file
|
||||||
|
|
|
||||||
|
|
@ -168,4 +168,3 @@ def crop_border(imgs, crop_border):
|
||||||
return [v[crop_border:-crop_border, crop_border:-crop_border, ...] for v in imgs]
|
return [v[crop_border:-crop_border, crop_border:-crop_border, ...] for v in imgs]
|
||||||
else:
|
else:
|
||||||
return imgs[crop_border:-crop_border, crop_border:-crop_border, ...]
|
return imgs[crop_border:-crop_border, crop_border:-crop_border, ...]
|
||||||
|
|
||||||
|
|
@ -166,4 +166,4 @@ def get_env_info():
|
||||||
f'\n\tBasicSR: {__version__}'
|
f'\n\tBasicSR: {__version__}'
|
||||||
f'\n\tPyTorch: {torch.__version__}'
|
f'\n\tPyTorch: {torch.__version__}'
|
||||||
f'\n\tTorchVision: {torchvision.__version__}')
|
f'\n\tTorchVision: {torchvision.__version__}')
|
||||||
return msg
|
return msg
|
||||||
|
|
|
||||||
|
|
@ -209,9 +209,9 @@ class RealESRGANer():
|
||||||
if img_mode == 'L':
|
if img_mode == 'L':
|
||||||
output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY)
|
output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY)
|
||||||
del output_img_t
|
del output_img_t
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
except RuntimeError as error:
|
except RuntimeError as error:
|
||||||
print(f"Failed inference for RealESRGAN: {error}")
|
print(f"Failed inference for RealESRGAN: {error}")
|
||||||
|
|
||||||
# ------------------- process the alpha channel if necessary ------------------- #
|
# ------------------- process the alpha channel if necessary ------------------- #
|
||||||
if img_mode == 'RGBA':
|
if img_mode == 'RGBA':
|
||||||
|
|
@ -296,4 +296,4 @@ class IOConsumer(threading.Thread):
|
||||||
output = msg['output']
|
output = msg['output']
|
||||||
save_path = msg['save_path']
|
save_path = msg['save_path']
|
||||||
cv2.imwrite(save_path, output)
|
cv2.imwrite(save_path, output)
|
||||||
print(f'IO worker {self.qid} is done.')
|
print(f'IO worker {self.qid} is done.')
|
||||||
|
|
|
||||||
|
|
@ -122,4 +122,4 @@ class VideoWriter:
|
||||||
|
|
||||||
def close(self):
|
def close(self):
|
||||||
self.stream_writer.stdin.close()
|
self.stream_writer.stdin.close()
|
||||||
self.stream_writer.wait()
|
self.stream_writer.wait()
|
||||||
|
|
|
||||||
|
|
@ -119,9 +119,9 @@ class YoloDetector:
|
||||||
origimgs = copy.deepcopy(images)
|
origimgs = copy.deepcopy(images)
|
||||||
|
|
||||||
images = self._preprocess(images)
|
images = self._preprocess(images)
|
||||||
|
|
||||||
if IS_HIGH_VERSION:
|
if IS_HIGH_VERSION:
|
||||||
with torch.inference_mode(): # for pytorch>=1.9
|
with torch.inference_mode(): # for pytorch>=1.9
|
||||||
pred = self.detector(images)[0]
|
pred = self.detector(images)[0]
|
||||||
else:
|
else:
|
||||||
with torch.no_grad(): # for pytorch<1.9
|
with torch.no_grad(): # for pytorch<1.9
|
||||||
|
|
|
||||||
|
|
@ -44,4 +44,4 @@ head:
|
||||||
[-1, 3, C3, [1024, False]], # 22 (P5/32-large)
|
[-1, 3, C3, [1024, False]], # 22 (P5/32-large)
|
||||||
|
|
||||||
[[16, 19, 22], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
|
[[16, 19, 22], 1, Detect, [nc, anchors]], # Detect(P3, P4, P5)
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -2,4 +2,4 @@ import torch
|
||||||
import sys
|
import sys
|
||||||
sys.path.insert(0,'./facelib/detection/yolov5face')
|
sys.path.insert(0,'./facelib/detection/yolov5face')
|
||||||
model = torch.load('facelib/detection/yolov5face/yolov5n-face.pt', map_location='cpu')['model']
|
model = torch.load('facelib/detection/yolov5face/yolov5n-face.pt', map_location='cpu')['model']
|
||||||
torch.save(model.state_dict(),'weights/facelib/yolov5n-face.pth')
|
torch.save(model.state_dict(),'weights/facelib/yolov5n-face.pth')
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,6 @@ from .face_utils import align_crop_face_landmarks, compute_increased_bbox, get_v
|
||||||
from .misc import img2tensor, load_file_from_url, download_pretrained_models, scandir
|
from .misc import img2tensor, load_file_from_url, download_pretrained_models, scandir
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
'align_crop_face_landmarks', 'compute_increased_bbox', 'get_valid_bboxes', 'load_file_from_url',
|
'align_crop_face_landmarks', 'compute_increased_bbox', 'get_valid_bboxes', 'load_file_from_url',
|
||||||
'download_pretrained_models', 'paste_face_back', 'img2tensor', 'scandir'
|
'download_pretrained_models', 'paste_face_back', 'img2tensor', 'scandir'
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,7 @@ class FaceRestoreHelper(object):
|
||||||
if self.template_3points:
|
if self.template_3points:
|
||||||
self.face_template = np.array([[192, 240], [319, 240], [257, 371]])
|
self.face_template = np.array([[192, 240], [319, 240], [257, 371]])
|
||||||
else:
|
else:
|
||||||
# standard 5 landmarks for FFHQ faces with 512 x 512
|
# standard 5 landmarks for FFHQ faces with 512 x 512
|
||||||
# facexlib
|
# facexlib
|
||||||
self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], [256.63416, 314.01935],
|
self.face_template = np.array([[192.98138, 239.94708], [318.90277, 240.1936], [256.63416, 314.01935],
|
||||||
[201.26117, 371.41043], [313.08905, 371.15118]])
|
[201.26117, 371.41043], [313.08905, 371.15118]])
|
||||||
|
|
@ -170,7 +170,7 @@ class FaceRestoreHelper(object):
|
||||||
landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)])
|
landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)])
|
||||||
self.all_landmarks_5.append(landmark)
|
self.all_landmarks_5.append(landmark)
|
||||||
self.det_faces.append(bbox[0:5])
|
self.det_faces.append(bbox[0:5])
|
||||||
|
|
||||||
if len(self.det_faces) == 0:
|
if len(self.det_faces) == 0:
|
||||||
return 0
|
return 0
|
||||||
if only_keep_largest:
|
if only_keep_largest:
|
||||||
|
|
@ -317,7 +317,7 @@ class FaceRestoreHelper(object):
|
||||||
|
|
||||||
assert len(self.restored_faces) == len(
|
assert len(self.restored_faces) == len(
|
||||||
self.inverse_affine_matrices), ('length of restored_faces and affine_matrices are different.')
|
self.inverse_affine_matrices), ('length of restored_faces and affine_matrices are different.')
|
||||||
|
|
||||||
inv_mask_borders = []
|
inv_mask_borders = []
|
||||||
for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices):
|
for restored_face, inverse_affine in zip(self.restored_faces, self.inverse_affine_matrices):
|
||||||
if face_upsampler is not None:
|
if face_upsampler is not None:
|
||||||
|
|
@ -457,4 +457,4 @@ class FaceRestoreHelper(object):
|
||||||
self.cropped_faces = []
|
self.cropped_faces = []
|
||||||
self.inverse_affine_matrices = []
|
self.inverse_affine_matrices = []
|
||||||
self.det_faces = []
|
self.det_faces = []
|
||||||
self.pad_input_imgs = []
|
self.pad_input_imgs = []
|
||||||
|
|
|
||||||
|
|
@ -13,7 +13,7 @@ ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__fil
|
||||||
|
|
||||||
def download_pretrained_models(file_ids, save_path_root):
|
def download_pretrained_models(file_ids, save_path_root):
|
||||||
import gdown
|
import gdown
|
||||||
|
|
||||||
os.makedirs(save_path_root, exist_ok=True)
|
os.makedirs(save_path_root, exist_ok=True)
|
||||||
|
|
||||||
for file_name, file_id in file_ids.items():
|
for file_name, file_id in file_ids.items():
|
||||||
|
|
|
||||||
|
|
@ -52,20 +52,20 @@ if __name__ == '__main__':
|
||||||
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
|
|
||||||
parser.add_argument('-i', '--input_path', type=str, default='./inputs/whole_imgs',
|
parser.add_argument('-i', '--input_path', type=str, default='./inputs/whole_imgs',
|
||||||
help='Input image, video or folder. Default: inputs/whole_imgs')
|
help='Input image, video or folder. Default: inputs/whole_imgs')
|
||||||
parser.add_argument('-o', '--output_path', type=str, default=None,
|
parser.add_argument('-o', '--output_path', type=str, default=None,
|
||||||
help='Output folder. Default: results/<input_name>_<w>')
|
help='Output folder. Default: results/<input_name>_<w>')
|
||||||
parser.add_argument('-w', '--fidelity_weight', type=float, default=0.5,
|
parser.add_argument('-w', '--fidelity_weight', type=float, default=0.5,
|
||||||
help='Balance the quality and fidelity. Default: 0.5')
|
help='Balance the quality and fidelity. Default: 0.5')
|
||||||
parser.add_argument('-s', '--upscale', type=int, default=2,
|
parser.add_argument('-s', '--upscale', type=int, default=2,
|
||||||
help='The final upsampling scale of the image. Default: 2')
|
help='The final upsampling scale of the image. Default: 2')
|
||||||
parser.add_argument('--has_aligned', action='store_true', help='Input are cropped and aligned faces. Default: False')
|
parser.add_argument('--has_aligned', action='store_true', help='Input are cropped and aligned faces. Default: False')
|
||||||
parser.add_argument('--only_center_face', action='store_true', help='Only restore the center face. Default: False')
|
parser.add_argument('--only_center_face', action='store_true', help='Only restore the center face. Default: False')
|
||||||
parser.add_argument('--draw_box', action='store_true', help='Draw the bounding box for the detected faces. Default: False')
|
parser.add_argument('--draw_box', action='store_true', help='Draw the bounding box for the detected faces. Default: False')
|
||||||
# large det_model: 'YOLOv5l', 'retinaface_resnet50'
|
# large det_model: 'YOLOv5l', 'retinaface_resnet50'
|
||||||
# small det_model: 'YOLOv5n', 'retinaface_mobile0.25'
|
# small det_model: 'YOLOv5n', 'retinaface_mobile0.25'
|
||||||
parser.add_argument('--detection_model', type=str, default='retinaface_resnet50',
|
parser.add_argument('--detection_model', type=str, default='retinaface_resnet50',
|
||||||
help='Face detector. Optional: retinaface_resnet50, retinaface_mobile0.25, YOLOv5l, YOLOv5n. \
|
help='Face detector. Optional: retinaface_resnet50, retinaface_mobile0.25, YOLOv5l, YOLOv5n. \
|
||||||
Default: retinaface_resnet50')
|
Default: retinaface_resnet50')
|
||||||
parser.add_argument('--bg_upsampler', type=str, default='None', help='Background upsampler. Optional: realesrgan')
|
parser.add_argument('--bg_upsampler', type=str, default='None', help='Background upsampler. Optional: realesrgan')
|
||||||
|
|
@ -91,7 +91,7 @@ if __name__ == '__main__':
|
||||||
input_img_list.append(image)
|
input_img_list.append(image)
|
||||||
image = vidreader.get_frame()
|
image = vidreader.get_frame()
|
||||||
audio = vidreader.get_audio()
|
audio = vidreader.get_audio()
|
||||||
fps = vidreader.get_fps() if args.save_video_fps is None else args.save_video_fps
|
fps = vidreader.get_fps() if args.save_video_fps is None else args.save_video_fps
|
||||||
video_name = os.path.basename(args.input_path)[:-4]
|
video_name = os.path.basename(args.input_path)[:-4]
|
||||||
result_root = f'results/{video_name}_{w}'
|
result_root = f'results/{video_name}_{w}'
|
||||||
input_video = True
|
input_video = True
|
||||||
|
|
@ -127,11 +127,11 @@ if __name__ == '__main__':
|
||||||
face_upsampler = None
|
face_upsampler = None
|
||||||
|
|
||||||
# ------------------ set up CodeFormer restorer -------------------
|
# ------------------ set up CodeFormer restorer -------------------
|
||||||
net = ARCH_REGISTRY.get('CodeFormer')(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9,
|
net = ARCH_REGISTRY.get('CodeFormer')(dim_embd=512, codebook_size=1024, n_head=8, n_layers=9,
|
||||||
connect_list=['32', '64', '128', '256']).to(device)
|
connect_list=['32', '64', '128', '256']).to(device)
|
||||||
|
|
||||||
# ckpt_path = 'weights/CodeFormer/codeformer.pth'
|
# ckpt_path = 'weights/CodeFormer/codeformer.pth'
|
||||||
ckpt_path = load_file_from_url(url=pretrain_model_url['restoration'],
|
ckpt_path = load_file_from_url(url=pretrain_model_url['restoration'],
|
||||||
model_dir='weights/CodeFormer', progress=True, file_name=None)
|
model_dir='weights/CodeFormer', progress=True, file_name=None)
|
||||||
checkpoint = torch.load(ckpt_path)['params_ema']
|
checkpoint = torch.load(ckpt_path)['params_ema']
|
||||||
net.load_state_dict(checkpoint)
|
net.load_state_dict(checkpoint)
|
||||||
|
|
@ -140,9 +140,9 @@ if __name__ == '__main__':
|
||||||
# ------------------ set up FaceRestoreHelper -------------------
|
# ------------------ set up FaceRestoreHelper -------------------
|
||||||
# large det_model: 'YOLOv5l', 'retinaface_resnet50'
|
# large det_model: 'YOLOv5l', 'retinaface_resnet50'
|
||||||
# small det_model: 'YOLOv5n', 'retinaface_mobile0.25'
|
# small det_model: 'YOLOv5n', 'retinaface_mobile0.25'
|
||||||
if not args.has_aligned:
|
if not args.has_aligned:
|
||||||
print(f'Face detection model: {args.detection_model}')
|
print(f'Face detection model: {args.detection_model}')
|
||||||
if bg_upsampler is not None:
|
if bg_upsampler is not None:
|
||||||
print(f'Background upsampling: True, Face upsampling: {args.face_upsample}')
|
print(f'Background upsampling: True, Face upsampling: {args.face_upsample}')
|
||||||
else:
|
else:
|
||||||
print(f'Background upsampling: False, Face upsampling: {args.face_upsample}')
|
print(f'Background upsampling: False, Face upsampling: {args.face_upsample}')
|
||||||
|
|
@ -160,7 +160,7 @@ if __name__ == '__main__':
|
||||||
for i, img_path in enumerate(input_img_list):
|
for i, img_path in enumerate(input_img_list):
|
||||||
# clean all the intermediate results to process the next image
|
# clean all the intermediate results to process the next image
|
||||||
face_helper.clean_all()
|
face_helper.clean_all()
|
||||||
|
|
||||||
if isinstance(img_path, str):
|
if isinstance(img_path, str):
|
||||||
img_name = os.path.basename(img_path)
|
img_name = os.path.basename(img_path)
|
||||||
basename, ext = os.path.splitext(img_name)
|
basename, ext = os.path.splitext(img_name)
|
||||||
|
|
@ -172,7 +172,7 @@ if __name__ == '__main__':
|
||||||
print(f'[{i+1}/{test_img_num}] Processing: {img_name}')
|
print(f'[{i+1}/{test_img_num}] Processing: {img_name}')
|
||||||
img = img_path
|
img = img_path
|
||||||
|
|
||||||
if args.has_aligned:
|
if args.has_aligned:
|
||||||
# the input faces are already cropped and aligned
|
# the input faces are already cropped and aligned
|
||||||
img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR)
|
img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR)
|
||||||
face_helper.is_gray = is_gray(img, threshold=5)
|
face_helper.is_gray = is_gray(img, threshold=5)
|
||||||
|
|
@ -218,7 +218,7 @@ if __name__ == '__main__':
|
||||||
bg_img = None
|
bg_img = None
|
||||||
face_helper.get_inverse_affine(None)
|
face_helper.get_inverse_affine(None)
|
||||||
# paste each restored face to the input image
|
# paste each restored face to the input image
|
||||||
if args.face_upsample and face_upsampler is not None:
|
if args.face_upsample and face_upsampler is not None:
|
||||||
restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box, face_upsampler=face_upsampler)
|
restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box, face_upsampler=face_upsampler)
|
||||||
else:
|
else:
|
||||||
restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box)
|
restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box)
|
||||||
|
|
@ -226,7 +226,7 @@ if __name__ == '__main__':
|
||||||
# save faces
|
# save faces
|
||||||
for idx, (cropped_face, restored_face) in enumerate(zip(face_helper.cropped_faces, face_helper.restored_faces)):
|
for idx, (cropped_face, restored_face) in enumerate(zip(face_helper.cropped_faces, face_helper.restored_faces)):
|
||||||
# save cropped face
|
# save cropped face
|
||||||
if not args.has_aligned:
|
if not args.has_aligned:
|
||||||
save_crop_path = os.path.join(result_root, 'cropped_faces', f'{basename}_{idx:02d}.png')
|
save_crop_path = os.path.join(result_root, 'cropped_faces', f'{basename}_{idx:02d}.png')
|
||||||
imwrite(cropped_face, save_crop_path)
|
imwrite(cropped_face, save_crop_path)
|
||||||
# save restored face
|
# save restored face
|
||||||
|
|
@ -261,7 +261,7 @@ if __name__ == '__main__':
|
||||||
video_name = f'{video_name}_{args.suffix}.png'
|
video_name = f'{video_name}_{args.suffix}.png'
|
||||||
save_restore_path = os.path.join(result_root, f'{video_name}.mp4')
|
save_restore_path = os.path.join(result_root, f'{video_name}.mp4')
|
||||||
vidwriter = VideoWriter(save_restore_path, height, width, fps, audio)
|
vidwriter = VideoWriter(save_restore_path, height, width, fps, audio)
|
||||||
|
|
||||||
for f in video_frames:
|
for f in video_frames:
|
||||||
vidwriter.write_frame(f)
|
vidwriter.write_frame(f)
|
||||||
vidwriter.close()
|
vidwriter.close()
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue