sdxl support (for SAI/HuggingFace/diffuser/community models) (#1952)
tag: 1.1.400    branch: pull/2041/head
parent 664ac74cae
commit 1d5402326d

LICENSE | 687

@@ -1,21 +1,674 @@
-MIT License
-
-Copyright (c) 2023 Kakigōri Maker
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
+GNU GENERAL PUBLIC LICENSE
+Version 3, 29 June 2007
+
+Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
+Everyone is permitted to copy and distribute verbatim copies
+of this license document, but changing it is not allowed.
+
+Preamble
+
+The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+The precise terms and conditions for copying, distribution and
+modification follow.
+
+TERMS AND CONDITIONS
+
+0. Definitions.
+
+"This License" refers to version 3 of the GNU General Public License.
+
+"Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+"The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+1. Source Code.
+
+The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+The Corresponding Source for a work in source code form is that
+same work.
+
+2. Basic Permissions.
+
+All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+4. Conveying Verbatim Copies.
+
+You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+5. Conveying Modified Source Versions.
+
+You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+a) The work must carry prominent notices stating that you modified
+it, and giving a relevant date.
+
+b) The work must carry prominent notices stating that it is
+released under this License and any conditions added under section
+7. This requirement modifies the requirement in section 4 to
+"keep intact all notices".
+
+c) You must license the entire work, as a whole, under this
+License to anyone who comes into possession of a copy. This
+License will therefore apply, along with any applicable section 7
+additional terms, to the whole of the work, and all its parts,
+regardless of how they are packaged. This License gives no
+permission to license the work in any other way, but it does not
+invalidate such permission if you have separately received it.
+
+d) If the work has interactive user interfaces, each must display
+Appropriate Legal Notices; however, if the Program has interactive
+interfaces that do not display Appropriate Legal Notices, your
+work need not make them do so.
+
+A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+6. Conveying Non-Source Forms.
+
+You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+a) Convey the object code in, or embodied in, a physical product
+(including a physical distribution medium), accompanied by the
+Corresponding Source fixed on a durable physical medium
+customarily used for software interchange.
+
+b) Convey the object code in, or embodied in, a physical product
+(including a physical distribution medium), accompanied by a
+written offer, valid for at least three years and valid for as
+long as you offer spare parts or customer support for that product
+model, to give anyone who possesses the object code either (1) a
+copy of the Corresponding Source for all the software in the
+product that is covered by this License, on a durable physical
+medium customarily used for software interchange, for a price no
+more than your reasonable cost of physically performing this
+conveying of source, or (2) access to copy the
+Corresponding Source from a network server at no charge.
+
+c) Convey individual copies of the object code with a copy of the
+written offer to provide the Corresponding Source. This
+alternative is allowed only occasionally and noncommercially, and
+only if you received the object code with such an offer, in accord
+with subsection 6b.
+
+d) Convey the object code by offering access from a designated
+place (gratis or for a charge), and offer equivalent access to the
+Corresponding Source in the same way through the same place at no
+further charge. You need not require recipients to copy the
+Corresponding Source along with the object code. If the place to
+copy the object code is a network server, the Corresponding Source
+may be on a different server (operated by you or a third party)
+that supports equivalent copying facilities, provided you maintain
+clear directions next to the object code saying where to find the
+Corresponding Source. Regardless of what server hosts the
+Corresponding Source, you remain obligated to ensure that it is
+available for as long as needed to satisfy these requirements.
+
+e) Convey the object code using peer-to-peer transmission, provided
+you inform other peers where the object code and Corresponding
+Source of the work are being offered to the general public at no
+charge under subsection 6d.
+
+A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+"Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+7. Additional Terms.
+
+"Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+a) Disclaiming warranty or limiting liability differently from the
+terms of sections 15 and 16 of this License; or
+
+b) Requiring preservation of specified reasonable legal notices or
+author attributions in that material or in the Appropriate Legal
+Notices displayed by works containing it; or
+
+c) Prohibiting misrepresentation of the origin of that material, or
+requiring that modified versions of such material be marked in
+reasonable ways as different from the original version; or
+
+d) Limiting the use for publicity purposes of names of licensors or
+authors of the material; or
+
+e) Declining to grant rights under trademark law for use of some
+trade names, trademarks, or service marks; or
+
+f) Requiring indemnification of licensors and authors of that
+material by anyone who conveys the material (or modified versions of
+it) with contractual assumptions of liability to the recipient, for
+any liability that these contractual assumptions directly impose on
+those licensors and authors.
+
+All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+8. Termination.
+
+You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+9. Acceptance Not Required for Having Copies.
+
+You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+10. Automatic Licensing of Downstream Recipients.
+
+Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+11. Patents.
+
+A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+12. No Surrender of Others' Freedom.
+
+If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+13. Use with the GNU Affero General Public License.
+
+Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+14. Revised Versions of this License.
+
+The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+15. Disclaimer of Warranty.
+
+THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+16. Limitation of Liability.
+
+IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+17. Interpretation of Sections 15 and 16.
+
+If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+END OF TERMS AND CONDITIONS
+
+How to Apply These Terms to Your New Programs
+
+If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+<one line to give the program's name and a brief idea of what it does.>
+Copyright (C) <year> <name of author>
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program. If not, see <https://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+<program> Copyright (C) <year> <name of author>
+This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+This is free software, and you are welcome to redistribute it
+under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<https://www.gnu.org/licenses/>.
+
+The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<https://www.gnu.org/licenses/why-not-lgpl.html>.

@@ -1,39 +0,0 @@
-import torch
-from transformers import CLIPProcessor, CLIPVisionModel
-from modules import devices
-import os
-from annotator.annotator_path import clip_vision_path
-
-
-remote_model_path = "https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin"
-clip_path = clip_vision_path
-print(f'ControlNet ClipVision location: {clip_path}')
-
-clip_proc = None
-clip_vision_model = None
-
-
-def apply_clip(img):
-    global clip_proc, clip_vision_model
-
-    if clip_vision_model is None:
-        modelpath = os.path.join(clip_path, 'pytorch_model.bin')
-        if not os.path.exists(modelpath):
-            from basicsr.utils.download_util import load_file_from_url
-            load_file_from_url(remote_model_path, model_dir=clip_path)
-
-        clip_proc = CLIPProcessor.from_pretrained(clip_path)
-        clip_vision_model = CLIPVisionModel.from_pretrained(clip_path)
-
-    with torch.no_grad():
-        clip_vision_model = clip_vision_model.to(devices.get_device_for("controlnet"))
-        style_for_clip = clip_proc(images=img, return_tensors="pt")['pixel_values']
-        style_feat = clip_vision_model(style_for_clip.to(devices.get_device_for("controlnet")))['last_hidden_state']
-
-    return style_feat
-
-
-def unload_clip_model():
-    global clip_proc, clip_vision_model
-    if clip_vision_model is not None:
-        clip_vision_model.cpu()

@@ -1,171 +0,0 @@
-{
-  "_name_or_path": "clip-vit-large-patch14/",
-  "architectures": [
-    "CLIPModel"
-  ],
-  "initializer_factor": 1.0,
-  "logit_scale_init_value": 2.6592,
-  "model_type": "clip",
-  "projection_dim": 768,
-  "text_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "max_position_embeddings": 77,
-    "min_length": 0,
-    "model_type": "clip_text_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 768,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.16.0.dev0",
-    "use_bfloat16": false,
-    "vocab_size": 49408
-  },
-  "text_config_dict": {
-    "hidden_size": 768,
-    "intermediate_size": 3072,
-    "num_attention_heads": 12,
-    "num_hidden_layers": 12,
-    "projection_dim": 768
-  },
-  "torch_dtype": "float32",
-  "transformers_version": null,
-  "vision_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": null,
-    "attention_dropout": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "dropout": 0.0,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "quick_gelu",
-    "hidden_size": 1024,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_factor": 1.0,
-    "initializer_range": 0.02,
-    "intermediate_size": 4096,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "clip_vision_model",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_hidden_layers": 24,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 14,
-    "prefix": null,
-    "problem_type": null,
-    "projection_dim": 768,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.16.0.dev0",
-    "use_bfloat16": false
-  },
-  "vision_config_dict": {
-    "hidden_size": 1024,
-    "intermediate_size": 4096,
-    "num_attention_heads": 16,
-    "num_hidden_layers": 24,
-    "patch_size": 14,
-    "projection_dim": 768
-  }
-}

File diff suppressed because it is too large

@@ -1,19 +0,0 @@
-{
-  "crop_size": 224,
-  "do_center_crop": true,
-  "do_normalize": true,
-  "do_resize": true,
-  "feature_extractor_type": "CLIPFeatureExtractor",
-  "image_mean": [
-    0.48145466,
-    0.4578275,
-    0.40821073
-  ],
-  "image_std": [
-    0.26862954,
-    0.26130258,
-    0.27577711
-  ],
-  "resample": 3,
-  "size": 224
-}

File diff suppressed because it is too large

@@ -1,34 +0,0 @@
-{
-  "unk_token": {
-    "content": "<|endoftext|>",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
-    "normalized": true,
-    "__type": "AddedToken"
-  },
-  "bos_token": {
-    "content": "<|startoftext|>",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
-    "normalized": true,
-    "__type": "AddedToken"
-  },
-  "eos_token": {
-    "content": "<|endoftext|>",
-    "single_word": false,
-    "lstrip": false,
-    "rstrip": false,
-    "normalized": true,
-    "__type": "AddedToken"
-  },
-  "pad_token": "<|endoftext|>",
-  "add_prefix_space": false,
-  "errors": "replace",
-  "do_lower_case": true,
-  "name_or_path": "openai/clip-vit-base-patch32",
-  "model_max_length": 77,
-  "special_tokens_map_file": "./special_tokens_map.json",
-  "tokenizer_class": "CLIPTokenizer"
-}

File diff suppressed because one or more lines are too long

@@ -0,0 +1,123 @@
+import os
+import torch
+
+from modules import devices
+from modules.modelloader import load_file_from_url
+from annotator.annotator_path import models_path
+from transformers import CLIPVisionModelWithProjection, CLIPVisionConfig, CLIPImageProcessor, modeling_utils
+
+
+config_clip_g = {
+    "attention_dropout": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1664,
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 8192,
+    "layer_norm_eps": 1e-05,
+    "model_type": "clip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 48,
+    "patch_size": 14,
+    "projection_dim": 1280,
+    "torch_dtype": "float32"
+}
+
+config_clip_h = {
+    "attention_dropout": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1280,
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 5120,
+    "layer_norm_eps": 1e-05,
+    "model_type": "clip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 32,
+    "patch_size": 14,
+    "projection_dim": 1024,
+    "torch_dtype": "float32"
+}
+
+config_clip_vitl = {
+    "attention_dropout": 0.0,
+    "dropout": 0.0,
+    "hidden_act": "quick_gelu",
+    "hidden_size": 1024,
+    "image_size": 224,
+    "initializer_factor": 1.0,
+    "initializer_range": 0.02,
+    "intermediate_size": 4096,
+    "layer_norm_eps": 1e-05,
+    "model_type": "clip_vision_model",
+    "num_attention_heads": 16,
+    "num_channels": 3,
+    "num_hidden_layers": 24,
+    "patch_size": 14,
+    "projection_dim": 768,
+    "torch_dtype": "float32"
+}
+
+configs = {
+    'clip_g': config_clip_g,
+    'clip_h': config_clip_h,
+    'clip_vitl': config_clip_vitl,
+}
+
+downloads = {
+    'clip_vitl': 'https://huggingface.co/openai/clip-vit-large-patch14/resolve/main/pytorch_model.bin',
+    'clip_g': 'https://huggingface.co/lllyasviel/Annotators/resolve/main/clip_g.pth',
+    'clip_h': 'https://huggingface.co/h94/IP-Adapter/resolve/main/models/image_encoder/pytorch_model.bin'
+}
+
+
+class ClipVisionDetector:
+    def __init__(self, config):
+        assert config in downloads
+        self.download_link = downloads[config]
+        self.model_path = os.path.join(models_path, 'clip_vision')
+        self.file_name = config + '.pth'
+        self.config = configs[config]
+        self.device = devices.get_device_for("controlnet")
+        os.makedirs(self.model_path, exist_ok=True)
+        file_path = os.path.join(self.model_path, self.file_name)
+        if not os.path.exists(file_path):
+            load_file_from_url(url=self.download_link, model_dir=self.model_path, file_name=self.file_name)
+        config = CLIPVisionConfig(**self.config)
+        self.model = CLIPVisionModelWithProjection(config)
+        self.processor = CLIPImageProcessor(crop_size=224,
+                                            do_center_crop=True,
+                                            do_convert_rgb=True,
+                                            do_normalize=True,
+                                            do_resize=True,
+                                            image_mean=[0.48145466, 0.4578275, 0.40821073],
+                                            image_std=[0.26862954, 0.26130258, 0.27577711],
+                                            resample=3,
+                                            size=224)
+
+        sd = torch.load(file_path, map_location=torch.device('cpu'))
+        self.model.load_state_dict(sd, strict=False)
+        del sd
+
+        self.model.eval()
+        self.model.cpu()
+
+    def unload_model(self):
+        if self.model is not None:
+            self.model.to('meta')
+
+    def __call__(self, input_image):
+        with torch.no_grad():
+            clip_vision_model = self.model.cpu()
+            feat = self.processor(images=input_image, return_tensors="pt")
+            feat['pixel_values'] = feat['pixel_values'].cpu()
+            result = clip_vision_model(**feat, output_hidden_states=True)
+            result['hidden_states'] = [v.to(devices.get_device_for("controlnet")) for v in result['hidden_states']]
+            result = {k: v.to(devices.get_device_for("controlnet")) if isinstance(v, torch.Tensor) else v for k, v in result.items()}
+            return result
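
Note: a minimal usage sketch for the new ClipVisionDetector above; the import path and the test image are assumptions for illustration, not part of the diff.

    import numpy as np
    from annotator.clipvision import ClipVisionDetector  # assumed location of the new file

    # 'clip_g' is the SDXL-sized vision tower added here (hidden_size 1664, 48 layers,
    # projection_dim 1280); 'clip_vitl' and 'clip_h' select the smaller encoders.
    detector = ClipVisionDetector('clip_g')
    image = np.zeros((512, 512, 3), dtype=np.uint8)  # any HWC uint8 image
    out = detector(image)                # inference runs on CPU, outputs move to the controlnet device
    print(out['image_embeds'].shape)     # expected torch.Size([1, 1280]) for clip_g
    detector.unload_model()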

@@ -1,8 +1,6 @@
-import random
-
 import cv2
 import numpy as np
-from annotator.util import make_noise_disk, img2mask
+from annotator.util import make_noise_disk


 class ContentShuffleDetector:

@@ -18,57 +16,3 @@ class ContentShuffleDetector:
         y = make_noise_disk(h, w, 1, f) * float(H - 1)
         flow = np.concatenate([x, y], axis=2).astype(np.float32)
         return cv2.remap(img, flow, None, cv2.INTER_LINEAR)
-
-
-class ColorShuffleDetector:
-    def __call__(self, img):
-        H, W, C = img.shape
-        F = np.random.randint(64, 384)
-        A = make_noise_disk(H, W, 3, F)
-        B = make_noise_disk(H, W, 3, F)
-        C = (A + B) / 2.0
-        A = (C + (A - C) * 3.0).clip(0, 1)
-        B = (C + (B - C) * 3.0).clip(0, 1)
-        L = img.astype(np.float32) / 255.0
-        Y = A * L + B * (1 - L)
-        Y -= np.min(Y, axis=(0, 1), keepdims=True)
-        Y /= np.maximum(np.max(Y, axis=(0, 1), keepdims=True), 1e-5)
-        Y *= 255.0
-        return Y.clip(0, 255).astype(np.uint8)
-
-
-class GrayDetector:
-    def __call__(self, img):
-        eps = 1e-5
-        X = img.astype(np.float32)
-        r, g, b = X[:, :, 0], X[:, :, 1], X[:, :, 2]
-        kr, kg, kb = [random.random() + eps for _ in range(3)]
-        ks = kr + kg + kb
-        kr /= ks
-        kg /= ks
-        kb /= ks
-        Y = r * kr + g * kg + b * kb
-        Y = np.stack([Y] * 3, axis=2)
-        return Y.clip(0, 255).astype(np.uint8)
-
-
-class DownSampleDetector:
-    def __call__(self, img, level=3, k=16.0):
-        h = img.astype(np.float32)
-        for _ in range(level):
-            h += np.random.normal(loc=0.0, scale=k, size=h.shape)
-            h = cv2.pyrDown(h)
-        for _ in range(level):
-            h = cv2.pyrUp(h)
-            h += np.random.normal(loc=0.0, scale=k, size=h.shape)
-        return h.clip(0, 255).astype(np.uint8)
-
-
-class Image2MaskShuffleDetector:
-    def __init__(self, resolution=(640, 512)):
-        self.H, self.W = resolution
-
-    def __call__(self, img):
-        m = img2mask(img, self.H, self.W)
-        m *= 255.0
-        return m.clip(0, 255).astype(np.uint8)

@@ -60,20 +60,3 @@ def safe_step(x, step=2):
     y = x.astype(np.float32) * float(step + 1)
     y = y.astype(np.int32).astype(np.float32) / float(step)
     return y
-
-
-def img2mask(img, H, W, low=10, high=90):
-    assert img.ndim == 3 or img.ndim == 2
-    assert img.dtype == np.uint8
-
-    if img.ndim == 3:
-        y = img[:, :, random.randrange(0, img.shape[2])]
-    else:
-        y = img
-
-    y = cv2.resize(y, (W, H), interpolation=cv2.INTER_CUBIC)
-
-    if random.uniform(0, 1) < 0.5:
-        y = 255 - y
-
-    return y < np.percentile(y, random.randrange(low, high))

@@ -283,7 +283,7 @@ def get_max_models_num():
     Fetch the maximum number of allowed ControlNet models.
     """

-    max_models_num = shared.opts.data.get("control_net_max_models_num", 1)
+    max_models_num = shared.opts.data.get("control_net_unit_count", 3)
     return max_models_num

@@ -362,7 +362,7 @@ def update_cn_script_in_place(
         return

     # fill in remaining parameters to satisfy max models, just in case script needs it.
-    max_models = shared.opts.data.get("control_net_max_models_num", 1)
+    max_models = shared.opts.data.get("control_net_unit_count", 3)
     cn_units = cn_units + [ControlNetUnit(enabled=False)] * max(max_models - len(cn_units), 0)

     cn_script_args_diff = 0
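
Note: the two hunks above replace the "control_net_max_models_num" option (default 1) with "control_net_unit_count" (default 3). A hedged sketch of a backward-compatible read, in case user settings still carry the old key; the helper name is hypothetical and not part of the diff:

    def get_unit_count(opts_data):
        # prefer the new key, fall back to the legacy key, then to the new default
        if "control_net_unit_count" in opts_data:
            return opts_data["control_net_unit_count"]
        return opts_data.get("control_net_max_models_num", 3)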

@@ -1,79 +0,0 @@
-model:
-  target: cldm.cldm.ControlLDM
-  params:
-    linear_start: 0.00085
-    linear_end: 0.0120
-    num_timesteps_cond: 1
-    log_every_t: 200
-    timesteps: 1000
-    first_stage_key: "jpg"
-    cond_stage_key: "txt"
-    control_key: "hint"
-    image_size: 64
-    channels: 4
-    cond_stage_trainable: false
-    conditioning_key: crossattn
-    monitor: val/loss_simple_ema
-    scale_factor: 0.18215
-    use_ema: False
-    only_mid_control: False
-
-    control_stage_config:
-      target: cldm.cldm.ControlNet
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        hint_channels: 3
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-
-    unet_config:
-      target: cldm.cldm.ControlledUnetModel
-      params:
-        image_size: 32 # unused
-        in_channels: 4
-        out_channels: 4
-        model_channels: 320
-        attention_resolutions: [ 4, 2, 1 ]
-        num_res_blocks: 2
-        channel_mult: [ 1, 2, 4, 4 ]
-        num_heads: 8
-        use_spatial_transformer: True
-        transformer_depth: 1
-        context_dim: 768
-        use_checkpoint: True
-        legacy: False
-
-    first_stage_config:
-      target: ldm.models.autoencoder.AutoencoderKL
-      params:
-        embed_dim: 4
-        monitor: val/rec_loss
-        ddconfig:
-          double_z: true
-          z_channels: 4
-          resolution: 256
-          in_channels: 3
-          out_ch: 3
-          ch: 128
-          ch_mult:
-          - 1
-          - 2
-          - 4
-          - 4
-          num_res_blocks: 2
-          attn_resolutions: []
-          dropout: 0.0
-        lossconfig:
-          target: torch.nn.Identity
-
-    cond_stage_config:
-      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
@ -1,85 +0,0 @@
model:
  target: cldm.cldm.ControlLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False

    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        use_checkpoint: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
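The two deleted configs above differ mainly in the text encoder: the first conditions on 768-dim CLIP embeddings (FrozenCLIPEmbedder), the second on 1024-dim OpenCLIP embeddings taken at the penultimate layer. A quick inspection sketch (hypothetical local paths):

from omegaconf import OmegaConf

for path in ["cldm_v15.yaml", "cldm_v21.yaml"]:  # hypothetical filenames
    cfg = OmegaConf.load(path)
    print(path, cfg.model.params.unet_config.params.context_dim)  # 768, then 1024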
@ -1,80 +0,0 @@
model:
  target: cldm.cldm.ControlLDM
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    control_key: "hint"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    only_mid_control: False
    global_average_pooling: True

    control_stage_config:
      target: cldm.cldm.ControlNet
      params:
        image_size: 32 # unused
        in_channels: 4
        hint_channels: 3
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    unet_config:
      target: cldm.cldm.ControlledUnetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
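This deleted variant is the one that sets global_average_pooling: True (used by shuffle-style control models). As an illustration of the idea, an assumption about the mechanism rather than code from this commit, pooling collapses each control residual to its spatial mean so only global color/style statistics are injected:

import torch

h = torch.randn(1, 320, 64, 64)            # a control residual
pooled = h.mean(dim=(2, 3), keepdim=True)  # (1, 320, 1, 1), broadcast back into the UNet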
@ -1,9 +0,0 @@
model:
  target: tencentarc.t21_adapter
  params:
    channels: [320, 640, 1280, 1280]
    nums_rb: 2
    ksize: 1
    sk: true
    cin: 192
    use_conv: false

@ -0,0 +1 @@
put_controlnet_models_here
@ -1,9 +0,0 @@
model:
  target: tencentarc.t21_adapter
  params:
    channels: [320, 640, 1280, 1280]
    nums_rb: 2
    ksize: 1
    sk: true
    cin: 64
    use_conv: false
@ -1,6 +0,0 @@
model:
  target: scripts.adapter.Adapter_light
  params:
    channels: [320, 640, 1280, 1280]
    nums_rb: 4
    cin: 192
@ -1,8 +0,0 @@
model:
  target: scripts.adapter.StyleAdapter
  params:
    width: 1024
    context_dim: 768
    num_head: 8
    n_layes: 3
    num_token: 8
@ -1,16 +1,11 @@
import torch
import torch.nn as nn
import importlib
from collections import OrderedDict

from omegaconf import OmegaConf
from copy import deepcopy
from modules import devices, lowvram, shared, scripts
cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x)
from ldm.modules.diffusionmodules.util import timestep_embedding
from ldm.modules.diffusionmodules.openaimodel import UNetModel


class TorchHijackForUnet:
@ -50,42 +45,12 @@ def align(hint, size):
    return hint


def get_node_name(name, parent_name):
    if len(name) <= len(parent_name):
        return False, ''
    p = name[:len(parent_name)]
    if p != parent_name:
        return False, ''
    return True, name[len(parent_name):]


def get_obj_from_str(string, reload=False):
    module, cls = string.rsplit(".", 1)
    if reload:
        module_imp = importlib.import_module(module)
        importlib.reload(module_imp)
    return getattr(importlib.import_module(module, package=None), cls)


class PlugableAdapter(nn.Module):
    def __init__(self, state_dict, config_path, lowvram=False, base_model=None) -> None:
    def __init__(self, control_model) -> None:
        super().__init__()
        self.config = OmegaConf.load(config_path)
        model = Adapter
        try:
            self.target = self.config.model.target
            model = get_obj_from_str(self.config.model.target)
        except ImportError:
            pass

        self.control_model = model(**self.config.model.params)
        self.control_model.load_state_dict(state_dict)
        self.lowvram = lowvram
        self.control_model = control_model
        self.control = None
        self.hint_cond = None

        if not self.lowvram:
            self.control_model.to(devices.get_device_for("controlnet"))

    def reset(self):
        self.control = None
@ -98,12 +63,21 @@ class PlugableAdapter(nn.Module):
        self.hint_cond = cond_cast_unet(hint)
        hint_in = cond_cast_unet(hint)

        if hasattr(self.control_model, 'conv_in') and self.control_model.conv_in.in_channels == 64:
        if hasattr(self.control_model, 'conv_in') and \
                (self.control_model.conv_in.in_channels == 64 or self.control_model.conv_in.in_channels == 256):
            hint_in = hint_in[:, 0:1, :, :]

        self.control = self.control_model(hint_in)
        return deepcopy(self.control)

    def aggressive_lowvram(self):
        self.to(devices.get_device_for("controlnet"))
        return

    def fullvram(self):
        self.to(devices.get_device_for("controlnet"))
        return


def conv_nd(dims, *args, **kwargs):
    """
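The widened conv_in.in_channels test above (64 or 256 means a single-channel hint, so only the first channel is kept) follows from PixelUnshuffle arithmetic; an illustration, not from the commit:

# After PixelUnshuffle(r), a C-channel hint becomes C * r * r channels.
for r, c in [(8, 1), (8, 3), (16, 1), (16, 3)]:
    print(f"r={r}, c={c} -> conv_in expects {c * r * r}")  # 64, 192, 256, 768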
@ -236,34 +210,76 @@ class ResnetBlock(nn.Module):


class Adapter(nn.Module):
    def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True):
    def __init__(self, channels=[320, 640, 1280, 1280], nums_rb=3, cin=64, ksize=3, sk=False, use_conv=True, is_sdxl=True):
        super(Adapter, self).__init__()
        self.unshuffle = nn.PixelUnshuffle(8)

        if is_sdxl:
            self.pixel_shuffle = 16
            downsample_avoided = [1]
            downsample_layers = [2]
        else:
            self.pixel_shuffle = 8
            downsample_avoided = []
            downsample_layers = [3, 2, 1]

        self.input_channels = cin // (self.pixel_shuffle * self.pixel_shuffle)
        self.channels = channels
        self.nums_rb = nums_rb
        self.body = []

        self.unshuffle = nn.PixelUnshuffle(self.pixel_shuffle)

        for i in range(len(channels)):
            for j in range(nums_rb):
                if (i != 0) and (j == 0):
                    self.body.append(ResnetBlock(channels[i - 1], channels[i], down=True, ksize=ksize, sk=sk, use_conv=use_conv))
                else:
                    self.body.append(ResnetBlock(channels[i], channels[i], down=False, ksize=ksize, sk=sk, use_conv=use_conv))
            for r in range(nums_rb):

                if i in downsample_layers and r == 0:
                    self.body.append(ResnetBlock(
                        channels[i - 1],
                        channels[i],
                        down=True,
                        ksize=ksize,
                        sk=sk,
                        use_conv=use_conv))
                    continue

                if i in downsample_avoided and r == 0:
                    self.body.append(ResnetBlock(
                        channels[i - 1],
                        channels[i],
                        down=False,
                        ksize=ksize,
                        sk=sk,
                        use_conv=use_conv))
                    continue

                self.body.append(ResnetBlock(
                    channels[i],
                    channels[i],
                    down=False,
                    ksize=ksize,
                    sk=sk,
                    use_conv=use_conv
                ))

        self.body = nn.ModuleList(self.body)
        self.conv_in = nn.Conv2d(cin, channels[0], 3, 1, 1)

    def forward(self, x):
        # unshuffle
        self.to(x.device)

        x = self.unshuffle(x)
        # extract features
        features = []
        hs = []

        x = self.conv_in(x)
        for i in range(len(self.channels)):
            for j in range(self.nums_rb):
                idx = i * self.nums_rb + j
            for r in range(self.nums_rb):
                idx = i * self.nums_rb + r
                x = self.body[idx](x)
            features.append(x)
            hs.append(x)

        self.to('cpu')
        return hs

        return features


class LayerNorm(nn.LayerNorm):
    """Subclass torch's LayerNorm to handle fp16."""
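A smoke-test sketch for the rewritten Adapter (assumes Adapter and ResnetBlock from this file are importable; cin=768 matches an RGB hint after PixelUnshuffle(16)):

import torch

adapter = Adapter(channels=[320, 640, 1280, 1280], nums_rb=2, cin=768,
                  ksize=1, sk=True, use_conv=False, is_sdxl=True)
feats = adapter(torch.zeros(1, 3, 1024, 1024))
print([tuple(f.shape) for f in feats])
# expected: 64x64 feature maps for the first two stages, 32x32 for the last two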
@ -77,7 +77,7 @@ def controlnet_api(_: gr.Blocks, app: FastAPI):
|
|||
@app.get("/controlnet/settings")
|
||||
async def settings():
|
||||
max_models_num = external_code.get_max_models_num()
|
||||
return {"control_net_max_models_num": max_models_num}
|
||||
return {"control_net_unit_count": max_models_num}
|
||||
|
||||
cached_cn_preprocessors = global_state.cache_preprocessors(
|
||||
global_state.cn_preprocessor_modules
|
||||
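Callers of the settings endpoint have to follow this key rename. A hedged client sketch; the localhost URL and port are assumptions about a default WebUI setup, not part of this change:

import requests

r = requests.get('http://127.0.0.1:7860/controlnet/settings')
# the count used to be published as "control_net_max_models_num";
# after this change it lives under the new key:
unit_count = r.json()['control_net_unit_count']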
|
|
|
|||
310
scripts/cldm.py
|
|
@ -1,108 +1,67 @@
|
|||
import torch
|
||||
import torch.nn as nn
|
||||
from omegaconf import OmegaConf
|
||||
from modules import devices, shared
|
||||
|
||||
cond_cast_unet = getattr(devices, 'cond_cast_unet', lambda x: x)
|
||||
|
||||
from ldm.util import exists
|
||||
from ldm.modules.attention import SpatialTransformer
|
||||
from ldm.modules.diffusionmodules.util import conv_nd, linear, zero_module, timestep_embedding
|
||||
from ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock
|
||||
from modules import devices
|
||||
|
||||
|
||||
class TorchHijackForUnet:
|
||||
"""
|
||||
This is torch, but with cat that resizes tensors to appropriate dimensions if they do not match;
|
||||
this makes it possible to create pictures with dimensions that are multiples of 8 rather than 64
|
||||
"""
|
||||
|
||||
def __getattr__(self, item):
|
||||
if item == 'cat':
|
||||
return self.cat
|
||||
|
||||
if hasattr(torch, item):
|
||||
return getattr(torch, item)
|
||||
|
||||
raise AttributeError("'{}' object has no attribute '{}'".format(type(self).__name__, item))
|
||||
|
||||
def cat(self, tensors, *args, **kwargs):
|
||||
if len(tensors) == 2:
|
||||
a, b = tensors
|
||||
if a.shape[-2:] != b.shape[-2:]:
|
||||
a = torch.nn.functional.interpolate(a, b.shape[-2:], mode="nearest")
|
||||
|
||||
tensors = (a, b)
|
||||
|
||||
return torch.cat(tensors, *args, **kwargs)
|
||||
|
||||
|
||||
th = TorchHijackForUnet()
|
||||
|
||||
|
||||
def align(hint, size):
|
||||
b, c, h1, w1 = hint.shape
|
||||
h, w = size
|
||||
if h != h1 or w != w1:
|
||||
hint = th.nn.functional.interpolate(hint, size=size, mode="nearest")
|
||||
return hint
|
||||
|
||||
|
||||
def get_node_name(name, parent_name):
|
||||
if len(name) <= len(parent_name):
|
||||
return False, ''
|
||||
p = name[:len(parent_name)]
|
||||
if p != parent_name:
|
||||
return False, ''
|
||||
return True, name[len(parent_name):]
|
||||
try:
|
||||
from sgm.modules.diffusionmodules.openaimodel import conv_nd, linear, zero_module, timestep_embedding, \
|
||||
TimestepEmbedSequential, ResBlock, Downsample, SpatialTransformer, exists
|
||||
using_sgm = True
|
||||
except ImportError:
|
||||
from ldm.modules.diffusionmodules.openaimodel import conv_nd, linear, zero_module, timestep_embedding, \
|
||||
TimestepEmbedSequential, ResBlock, Downsample, SpatialTransformer, exists
|
||||
using_sgm = False
|
||||
|
||||
|
||||
class PlugableControlModel(nn.Module):
|
||||
def __init__(self, state_dict, config_path, lowvram=False, base_model=None) -> None:
|
||||
def __init__(self, config, state_dict=None):
|
||||
super().__init__()
|
||||
self.config = OmegaConf.load(config_path)
|
||||
self.control_model = ControlNet(**self.config.model.params.control_stage_config.params)
|
||||
|
||||
if any([k.startswith("control_model.") for k, v in state_dict.items()]):
|
||||
if 'difference' in state_dict and base_model is not None:
|
||||
print('We will stop supporting diff models soon because of their lack of robustness.')
|
||||
print('Please begin to use official models as soon as possible.')
|
||||
self.config = config
|
||||
self.control_model = ControlNet(**self.config).cpu()
|
||||
if state_dict is not None:
|
||||
self.control_model.load_state_dict(state_dict, strict=False)
|
||||
self.gpu_component = None
|
||||
self.is_control_lora = False
|
||||
|
||||
unet_state_dict = base_model.state_dict()
|
||||
unet_state_dict_keys = unet_state_dict.keys()
|
||||
final_state_dict = {}
|
||||
counter = 0
|
||||
for key in state_dict.keys():
|
||||
if not key.startswith("control_model."):
|
||||
continue
|
||||
p = state_dict[key]
|
||||
is_control, node_name = get_node_name(key, 'control_')
|
||||
key_name = node_name.replace("model.", "") if is_control else key
|
||||
if key_name in unet_state_dict_keys:
|
||||
p_new = p + unet_state_dict[key_name].clone().cpu()
|
||||
counter += 1
|
||||
else:
|
||||
p_new = p
|
||||
final_state_dict[key] = p_new
|
||||
print(f'Diff model cloned: {counter} values')
|
||||
state_dict = final_state_dict
|
||||
state_dict = {k.replace("control_model.", ""): v for k, v in state_dict.items() if k.startswith("control_model.")}
|
||||
|
||||
self.control_model.load_state_dict(state_dict)
|
||||
if not lowvram:
|
||||
self.control_model.to(devices.get_device_for("controlnet"))
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def forward(self, *args, **kwargs):
|
||||
return self.control_model(*args, **kwargs)
|
||||
|
||||
def aggressive_lowvram(self):
|
||||
self.to('cpu')
|
||||
|
||||
def send_me_to_gpu(module, _):
|
||||
if self.gpu_component == module:
|
||||
return
|
||||
|
||||
if self.gpu_component is not None:
|
||||
self.gpu_component.to('cpu')
|
||||
|
||||
module.to(devices.get_device_for("controlnet"))
|
||||
self.gpu_component = module
|
||||
|
||||
self.control_model.time_embed.register_forward_pre_hook(send_me_to_gpu)
|
||||
self.control_model.input_hint_block.register_forward_pre_hook(send_me_to_gpu)
|
||||
self.control_model.label_emb.register_forward_pre_hook(send_me_to_gpu)
|
||||
for m in self.control_model.input_blocks:
|
||||
m.register_forward_pre_hook(send_me_to_gpu)
|
||||
for m in self.control_model.zero_convs:
|
||||
m.register_forward_pre_hook(send_me_to_gpu)
|
||||
self.control_model.middle_block.register_forward_pre_hook(send_me_to_gpu)
|
||||
self.control_model.middle_block_out.register_forward_pre_hook(send_me_to_gpu)
|
||||
return
|
||||
|
||||
def fullvram(self):
|
||||
self.to(devices.get_device_for("controlnet"))
|
||||
return
|
||||
|
||||
|
||||
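aggressive_lowvram above keeps at most one submodule resident on the GPU by registering forward pre-hooks that shuttle blocks in on demand. The same idea in isolation, as a sketch with a toy network rather than the real ControlNet blocks:

import torch
import torch.nn as nn

class Shuttler:
    """Move each module to `device` just before it runs; evict the previous one."""
    def __init__(self, device):
        self.device = device
        self.gpu_component = None

    def __call__(self, module, _inputs):
        if self.gpu_component is module:
            return
        if self.gpu_component is not None:
            self.gpu_component.to('cpu')
        module.to(self.device)
        self.gpu_component = module

device = 'cuda' if torch.cuda.is_available() else 'cpu'
shuttle = Shuttler(device)
net = nn.Sequential(nn.Linear(8, 8), nn.Linear(8, 8)).to('cpu')
for m in net:
    m.register_forward_pre_hook(shuttle)
y = net(torch.randn(1, 8, device=device))  # blocks hop onto `device` one at a time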
class ControlNet(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
image_size,
|
||||
in_channels,
|
||||
model_channels,
|
||||
hint_channels,
|
||||
|
|
@ -112,75 +71,54 @@ class ControlNet(nn.Module):
|
|||
channel_mult=(1, 2, 4, 8),
|
||||
conv_resample=True,
|
||||
dims=2,
|
||||
num_classes=None,
|
||||
use_checkpoint=False,
|
||||
use_fp16=False,
|
||||
use_fp16=True,
|
||||
num_heads=-1,
|
||||
num_head_channels=-1,
|
||||
num_heads_upsample=-1,
|
||||
use_scale_shift_norm=False,
|
||||
resblock_updown=False,
|
||||
use_new_attention_order=False,
|
||||
use_spatial_transformer=False, # custom transformer support
|
||||
transformer_depth=1, # custom transformer support
|
||||
context_dim=None, # custom transformer support
|
||||
# custom support for prediction of discrete ids into codebook of first stage vq model
|
||||
use_spatial_transformer=True,
|
||||
transformer_depth=1,
|
||||
context_dim=None,
|
||||
n_embed=None,
|
||||
legacy=True,
|
||||
legacy=False,
|
||||
disable_self_attentions=None,
|
||||
num_attention_blocks=None,
|
||||
disable_middle_self_attn=False,
|
||||
use_linear_in_transformer=False,
|
||||
adm_in_channels=None,
|
||||
transformer_depth_middle=None,
|
||||
device=None,
|
||||
global_average_pooling=False,
|
||||
):
|
||||
use_fp16 = getattr(devices, 'dtype_unet', devices.dtype) == torch.float16 and not getattr(shared.cmd_opts, "no_half_controlnet", False)
|
||||
|
||||
super().__init__()
|
||||
if use_spatial_transformer:
|
||||
assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...'
|
||||
|
||||
if context_dim is not None:
|
||||
assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...'
|
||||
from omegaconf.listconfig import ListConfig
|
||||
if type(context_dim) == ListConfig:
|
||||
context_dim = list(context_dim)
|
||||
self.global_average_pooling = global_average_pooling
|
||||
|
||||
if num_heads_upsample == -1:
|
||||
num_heads_upsample = num_heads
|
||||
|
||||
if num_heads == -1:
|
||||
assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set'
|
||||
|
||||
if num_head_channels == -1:
|
||||
assert num_heads != -1, 'Either num_heads or num_head_channels has to be set'
|
||||
|
||||
self.dims = dims
|
||||
self.image_size = image_size
|
||||
self.in_channels = in_channels
|
||||
self.model_channels = model_channels
|
||||
if isinstance(transformer_depth, int):
|
||||
transformer_depth = len(channel_mult) * [transformer_depth]
|
||||
if transformer_depth_middle is None:
|
||||
transformer_depth_middle = transformer_depth[-1]
|
||||
if isinstance(num_res_blocks, int):
|
||||
self.num_res_blocks = len(channel_mult) * [num_res_blocks]
|
||||
else:
|
||||
if len(num_res_blocks) != len(channel_mult):
|
||||
raise ValueError("provide num_res_blocks either as an int (globally constant) or "
|
||||
"as a list/tuple (per-level) with the same length as channel_mult")
|
||||
self.num_res_blocks = num_res_blocks
|
||||
if disable_self_attentions is not None:
|
||||
# should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not
|
||||
assert len(disable_self_attentions) == len(channel_mult)
|
||||
if num_attention_blocks is not None:
|
||||
assert len(num_attention_blocks) == len(self.num_res_blocks)
|
||||
assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(
|
||||
len(num_attention_blocks))))
|
||||
print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. "
|
||||
f"This option has LESS priority than attention_resolutions {attention_resolutions}, "
|
||||
f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, "
|
||||
f"attention will still not be set.")
|
||||
|
||||
self.attention_resolutions = attention_resolutions
|
||||
self.dropout = dropout
|
||||
self.channel_mult = channel_mult
|
||||
self.conv_resample = conv_resample
|
||||
self.num_classes = num_classes
|
||||
self.use_checkpoint = use_checkpoint
|
||||
self.dtype = th.float16 if use_fp16 else th.float32
|
||||
self.dtype = torch.float16 if use_fp16 else torch.float32
|
||||
self.num_heads = num_heads
|
||||
self.num_head_channels = num_head_channels
|
||||
self.num_heads_upsample = num_heads_upsample
|
||||
|
|
@ -188,36 +126,54 @@ class ControlNet(nn.Module):
|
|||
|
||||
time_embed_dim = model_channels * 4
|
||||
self.time_embed = nn.Sequential(
|
||||
linear(model_channels, time_embed_dim),
|
||||
linear(model_channels, time_embed_dim, dtype=self.dtype, device=device),
|
||||
nn.SiLU(),
|
||||
linear(time_embed_dim, time_embed_dim),
|
||||
linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
|
||||
)
|
||||
|
||||
if self.num_classes is not None:
|
||||
if isinstance(self.num_classes, int):
|
||||
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
||||
elif self.num_classes == "continuous":
|
||||
print("setting up linear c_adm embedding layer")
|
||||
self.label_emb = nn.Linear(1, time_embed_dim)
|
||||
elif self.num_classes == "sequential":
|
||||
assert adm_in_channels is not None
|
||||
self.label_emb = nn.Sequential(
|
||||
nn.Sequential(
|
||||
linear(adm_in_channels, time_embed_dim, dtype=self.dtype, device=device),
|
||||
nn.SiLU(),
|
||||
linear(time_embed_dim, time_embed_dim, dtype=self.dtype, device=device),
|
||||
)
|
||||
)
|
||||
else:
|
||||
raise ValueError()
|
||||
|
||||
self.input_blocks = nn.ModuleList(
|
||||
[
|
||||
TimestepEmbedSequential(
|
||||
conv_nd(dims, in_channels, model_channels, 3, padding=1)
|
||||
conv_nd(dims, in_channels, model_channels, 3, padding=1, dtype=self.dtype, device=device)
|
||||
)
|
||||
]
|
||||
)
|
||||
self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)])
|
||||
|
||||
self.input_hint_block = TimestepEmbedSequential(
|
||||
conv_nd(dims, hint_channels, 16, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 16, 16, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 16, 32, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 32, 32, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 32, 96, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 96, 96, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 96, 256, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
|
||||
conv_nd(dims, hint_channels, 16, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 16, 16, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 16, 32, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 32, 32, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 32, 96, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 96, 96, 3, padding=1),
|
||||
nn.SiLU(),
|
||||
conv_nd(dims, 96, 256, 3, padding=1, stride=2),
|
||||
nn.SiLU(),
|
||||
zero_module(conv_nd(dims, 256, model_channels, 3, padding=1))
|
||||
)
|
||||
|
||||
self._feature_size = model_channels
|
||||
|
|
@ -234,7 +190,7 @@ class ControlNet(nn.Module):
|
|||
out_channels=mult * model_channels,
|
||||
dims=dims,
|
||||
use_checkpoint=use_checkpoint,
|
||||
use_scale_shift_norm=use_scale_shift_norm,
|
||||
use_scale_shift_norm=use_scale_shift_norm
|
||||
)
|
||||
]
|
||||
ch = mult * model_channels
|
||||
|
|
@ -254,14 +210,8 @@ class ControlNet(nn.Module):
|
|||
|
||||
if not exists(num_attention_blocks) or nr < num_attention_blocks[level]:
|
||||
layers.append(
|
||||
AttentionBlock(
|
||||
ch,
|
||||
use_checkpoint=use_checkpoint,
|
||||
num_heads=num_heads,
|
||||
num_head_channels=dim_head,
|
||||
use_new_attention_order=use_new_attention_order,
|
||||
) if not use_spatial_transformer else SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||
SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth[level], context_dim=context_dim,
|
||||
disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer,
|
||||
use_checkpoint=use_checkpoint
|
||||
)
|
||||
|
|
@ -282,7 +232,7 @@ class ControlNet(nn.Module):
|
|||
dims=dims,
|
||||
use_checkpoint=use_checkpoint,
|
||||
use_scale_shift_norm=use_scale_shift_norm,
|
||||
down=True,
|
||||
down=True
|
||||
)
|
||||
if resblock_updown
|
||||
else Downsample(
|
||||
|
|
@ -311,27 +261,20 @@ class ControlNet(nn.Module):
|
|||
dropout,
|
||||
dims=dims,
|
||||
use_checkpoint=use_checkpoint,
|
||||
use_scale_shift_norm=use_scale_shift_norm,
|
||||
),
|
||||
AttentionBlock(
|
||||
ch,
|
||||
use_checkpoint=use_checkpoint,
|
||||
num_heads=num_heads,
|
||||
num_head_channels=dim_head,
|
||||
use_new_attention_order=use_new_attention_order,
|
||||
# always uses a self-attn
|
||||
) if not use_spatial_transformer else SpatialTransformer(
|
||||
ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim,
|
||||
disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
|
||||
use_checkpoint=use_checkpoint
|
||||
use_scale_shift_norm=use_scale_shift_norm
|
||||
),
|
||||
SpatialTransformer( # always uses a self-attn
|
||||
ch, num_heads, dim_head, depth=transformer_depth_middle, context_dim=context_dim,
|
||||
disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer,
|
||||
use_checkpoint=use_checkpoint
|
||||
),
|
||||
ResBlock(
|
||||
ch,
|
||||
time_embed_dim,
|
||||
dropout,
|
||||
dims=dims,
|
||||
use_checkpoint=use_checkpoint,
|
||||
use_scale_shift_norm=use_scale_shift_norm,
|
||||
use_scale_shift_norm=use_scale_shift_norm
|
||||
),
|
||||
)
|
||||
self.middle_block_out = self.make_zero_conv(ch)
|
||||
|
|
@ -339,24 +282,29 @@ class ControlNet(nn.Module):
|
|||
|
||||
def make_zero_conv(self, channels):
|
||||
return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0)))
|
||||
|
||||
def align(self, hint, h, w):
|
||||
b, c, h1, w1 = hint.shape
|
||||
if h != h1 or w != w1:
|
||||
return align(hint, (h, w))
|
||||
return hint
|
||||
|
||||
def forward(self, x, hint, timesteps, context, **kwargs):
|
||||
t_emb = cond_cast_unet(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
|
||||
def forward(self, x, hint, timesteps, context, y=None, **kwargs):
|
||||
original_type = x.dtype
|
||||
|
||||
x = x.to(self.dtype)
|
||||
hint = hint.to(self.dtype)
|
||||
timesteps = timesteps.to(self.dtype)
|
||||
context = context.to(self.dtype)
|
||||
|
||||
if y is not None:
|
||||
y = y.to(self.dtype)
|
||||
|
||||
t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False).to(self.dtype)
|
||||
emb = self.time_embed(t_emb)
|
||||
|
||||
guided_hint = self.input_hint_block(cond_cast_unet(hint), emb, context)
|
||||
outs = []
|
||||
|
||||
h1, w1 = x.shape[-2:]
|
||||
guided_hint = self.align(guided_hint, h1, w1)
|
||||
|
||||
h = x.type(self.dtype)
|
||||
guided_hint = self.input_hint_block(hint, emb, context)
|
||||
outs = []
|
||||
|
||||
if self.num_classes is not None:
|
||||
assert y.shape[0] == x.shape[0]
|
||||
emb = emb + self.label_emb(y)
|
||||
|
||||
h = x
|
||||
for module, zero_conv in zip(self.input_blocks, self.zero_convs):
|
||||
if guided_hint is not None:
|
||||
h = module(h, emb, context)
|
||||
|
|
@ -369,4 +317,6 @@ class ControlNet(nn.Module):
|
|||
h = self.middle_block(h, emb, context)
|
||||
outs.append(self.middle_block_out(h, emb, context))
|
||||
|
||||
outs = [o.to(original_type) for o in outs]
|
||||
|
||||
return outs
|
||||
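The rewritten forward casts every input to the control model's own dtype and converts the residuals back at the end, so callers never observe a precision change. Reduced to its essence as a sketch:

import torch

def run_in_model_dtype(fn, x, model_dtype=torch.float16):
    original_type = x.dtype
    outs = fn(x.to(model_dtype))                # control model runs in its own precision
    return [o.to(original_type) for o in outs]  # hand residuals back unchanged

outs = run_in_model_dtype(lambda t: [t * 2, t + 1],
                          torch.randn(2, 4, dtype=torch.float32))
assert all(o.dtype == torch.float32 for o in outs)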
|
|
|
|||
|
|
@ -0,0 +1,393 @@
|
|||
import math
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
# attention_channels of input, output, middle
|
||||
SD_V12_CHANNELS = [320] * 4 + [640] * 4 + [1280] * 4 + [1280] * 6 + [640] * 6 + [320] * 6 + [1280] * 2
|
||||
SD_XL_CHANNELS = [640] * 8 + [1280] * 40 + [1280] * 60 + [640] * 12 + [1280] * 20
|
||||
|
||||
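Each hacked cross-attention layer contributes one to_k_ip and one to_v_ip projection, so each table holds twice the number of attention layers: 16 layers for SD 1.x and 70 for SDXL. A quick sanity check against the constants above:

assert len(SD_V12_CHANNELS) == 2 * 16   # 16 cross-attn layers in SD 1.x
assert len(SD_XL_CHANNELS) == 2 * 70    # 70 cross-attn layers in SDXL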
|
||||
class ImageProjModel(torch.nn.Module):
|
||||
"""Projection Model"""
|
||||
|
||||
def __init__(self, cross_attention_dim=1024, clip_embeddings_dim=1024, clip_extra_context_tokens=4):
|
||||
super().__init__()
|
||||
|
||||
self.cross_attention_dim = cross_attention_dim
|
||||
self.clip_extra_context_tokens = clip_extra_context_tokens
|
||||
self.proj = torch.nn.Linear(clip_embeddings_dim, self.clip_extra_context_tokens * cross_attention_dim)
|
||||
self.norm = torch.nn.LayerNorm(cross_attention_dim)
|
||||
|
||||
def forward(self, image_embeds):
|
||||
embeds = image_embeds
|
||||
clip_extra_context_tokens = self.proj(embeds).reshape(-1, self.clip_extra_context_tokens,
|
||||
self.cross_attention_dim)
|
||||
clip_extra_context_tokens = self.norm(clip_extra_context_tokens)
|
||||
return clip_extra_context_tokens
|
||||
|
||||
|
||||
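ImageProjModel maps one pooled CLIP image embedding to a few extra context tokens through a single linear layer. A usage sketch with the class defaults; the batch size is arbitrary:

import torch

proj = ImageProjModel(cross_attention_dim=1024, clip_embeddings_dim=1024,
                      clip_extra_context_tokens=4)
image_embeds = torch.randn(2, 1024)   # (batch, clip_embeddings_dim)
tokens = proj(image_embeds)
assert tokens.shape == (2, 4, 1024)   # (batch, extra_tokens, cross_attention_dim)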
# Cross Attention to_k, to_v for IPAdapter
|
||||
class To_KV(torch.nn.Module):
|
||||
def __init__(self, cross_attention_dim):
|
||||
super().__init__()
|
||||
|
||||
channels = SD_XL_CHANNELS if cross_attention_dim == 2048 else SD_V12_CHANNELS
|
||||
self.to_kvs = torch.nn.ModuleList(
|
||||
[torch.nn.Linear(cross_attention_dim, channel, bias=False) for channel in channels])
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
# input -> output -> middle
|
||||
for i, key in enumerate(state_dict.keys()):
|
||||
self.to_kvs[i].weight.data = state_dict[key]
|
||||
|
||||
|
||||
def FeedForward(dim, mult=4):
|
||||
inner_dim = int(dim * mult)
|
||||
return nn.Sequential(
|
||||
nn.LayerNorm(dim),
|
||||
nn.Linear(dim, inner_dim, bias=False),
|
||||
nn.GELU(),
|
||||
nn.Linear(inner_dim, dim, bias=False),
|
||||
)
|
||||
|
||||
|
||||
def reshape_tensor(x, heads):
|
||||
bs, length, width = x.shape
|
||||
# (bs, length, width) --> (bs, length, n_heads, dim_per_head)
|
||||
x = x.view(bs, length, heads, -1)
|
||||
# (bs, length, n_heads, dim_per_head) --> (bs, n_heads, length, dim_per_head)
|
||||
x = x.transpose(1, 2)
|
||||
# the reshape is shape-preserving here: stays (bs, n_heads, length, dim_per_head)
|
||||
x = x.reshape(bs, heads, length, -1)
|
||||
return x
|
||||
|
||||
|
||||
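reshape_tensor splits the width axis into heads and moves heads ahead of length; the result stays 4-D, which is the layout the batched matmuls in PerceiverAttention expect. A sketch check:

import torch

x = torch.randn(2, 7, 64)          # (bs, length, width)
y = reshape_tensor(x, heads=8)
assert y.shape == (2, 8, 7, 8)     # width 64 split into 8 heads of dim 8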
class PerceiverAttention(nn.Module):
|
||||
def __init__(self, *, dim, dim_head=64, heads=8):
|
||||
super().__init__()
|
||||
self.scale = dim_head**-0.5
|
||||
self.dim_head = dim_head
|
||||
self.heads = heads
|
||||
inner_dim = dim_head * heads
|
||||
|
||||
self.norm1 = nn.LayerNorm(dim)
|
||||
self.norm2 = nn.LayerNorm(dim)
|
||||
|
||||
self.to_q = nn.Linear(dim, inner_dim, bias=False)
|
||||
self.to_kv = nn.Linear(dim, inner_dim * 2, bias=False)
|
||||
self.to_out = nn.Linear(inner_dim, dim, bias=False)
|
||||
|
||||
|
||||
def forward(self, x, latents):
|
||||
"""
|
||||
Args:
|
||||
x (torch.Tensor): image features
|
||||
shape (b, n1, D)
|
||||
latent (torch.Tensor): latent features
|
||||
shape (b, n2, D)
|
||||
"""
|
||||
x = self.norm1(x)
|
||||
latents = self.norm2(latents)
|
||||
|
||||
b, l, _ = latents.shape
|
||||
|
||||
q = self.to_q(latents)
|
||||
kv_input = torch.cat((x, latents), dim=-2)
|
||||
k, v = self.to_kv(kv_input).chunk(2, dim=-1)
|
||||
|
||||
q = reshape_tensor(q, self.heads)
|
||||
k = reshape_tensor(k, self.heads)
|
||||
v = reshape_tensor(v, self.heads)
|
||||
|
||||
# attention
|
||||
scale = 1 / math.sqrt(math.sqrt(self.dim_head))
|
||||
weight = (q * scale) @ (k * scale).transpose(-2, -1) # More stable with f16 than dividing afterwards
|
||||
weight = torch.softmax(weight.float(), dim=-1).type(weight.dtype)
|
||||
out = weight @ v
|
||||
|
||||
out = out.permute(0, 2, 1, 3).reshape(b, l, -1)
|
||||
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
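PerceiverAttention scales q and k each by d**-0.25 before the matmul instead of dividing the logits by sqrt(d) afterwards; the two are mathematically identical, but the split keeps fp16 intermediates smaller. A quick numeric check:

import math
import torch

d = 64
q, k = torch.randn(1, 3, d), torch.randn(1, 5, d)
s = 1 / math.sqrt(math.sqrt(d))                       # d ** -0.25
a = (q * s) @ (k * s).transpose(-2, -1)               # split scaling
b = (q @ k.transpose(-2, -1)) / math.sqrt(d)          # textbook scaling
assert torch.allclose(a, b, atol=1e-5)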
class Resampler(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
dim=1024,
|
||||
depth=8,
|
||||
dim_head=64,
|
||||
heads=16,
|
||||
num_queries=8,
|
||||
embedding_dim=768,
|
||||
output_dim=1024,
|
||||
ff_mult=4,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
self.latents = nn.Parameter(torch.randn(1, num_queries, dim) / dim**0.5)
|
||||
|
||||
self.proj_in = nn.Linear(embedding_dim, dim)
|
||||
|
||||
self.proj_out = nn.Linear(dim, output_dim)
|
||||
self.norm_out = nn.LayerNorm(output_dim)
|
||||
|
||||
self.layers = nn.ModuleList([])
|
||||
for _ in range(depth):
|
||||
self.layers.append(
|
||||
nn.ModuleList(
|
||||
[
|
||||
PerceiverAttention(dim=dim, dim_head=dim_head, heads=heads),
|
||||
FeedForward(dim=dim, mult=ff_mult),
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
def forward(self, x):
|
||||
|
||||
latents = self.latents.repeat(x.size(0), 1, 1)
|
||||
|
||||
x = self.proj_in(x)
|
||||
|
||||
for attn, ff in self.layers:
|
||||
latents = attn(x, latents) + latents
|
||||
latents = ff(latents) + latents
|
||||
|
||||
latents = self.proj_out(latents)
|
||||
return self.norm_out(latents)
|
||||
|
||||
|
||||
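A Resampler usage sketch, wired like the "plus" configuration that IPAdapterModel below constructs; the 1280-wide, 257-token CLIP hidden state is an assumption about the vision tower, not something this file fixes:

import torch

resampler = Resampler(dim=768, depth=4, dim_head=64, heads=12,
                      num_queries=16, embedding_dim=1280, output_dim=768)
clip_hidden = torch.randn(1, 257, 1280)   # penultimate CLIP hidden states (assumed shape)
tokens = resampler(clip_hidden)
assert tokens.shape == (1, 16, 768)       # 16 learned query tokens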
class IPAdapterModel(torch.nn.Module):
|
||||
def __init__(self, state_dict, clip_embeddings_dim, is_plus):
|
||||
super().__init__()
|
||||
self.device = "cpu"
|
||||
|
||||
# cross_attention_dim equals the text encoder output dimension
|
||||
self.cross_attention_dim = state_dict["ip_adapter"]["1.to_k_ip.weight"].shape[1]
|
||||
self.is_plus = is_plus
|
||||
|
||||
if self.is_plus:
|
||||
self.clip_extra_context_tokens = 16
|
||||
|
||||
self.image_proj_model = Resampler(
|
||||
dim=self.cross_attention_dim,
|
||||
depth=4,
|
||||
dim_head=64,
|
||||
heads=12,
|
||||
num_queries=self.clip_extra_context_tokens,
|
||||
embedding_dim=clip_embeddings_dim,
|
||||
output_dim=self.cross_attention_dim,
|
||||
ff_mult=4
|
||||
)
|
||||
else:
|
||||
self.clip_extra_context_tokens = state_dict["image_proj"]["proj.weight"].shape[0] // self.cross_attention_dim
|
||||
|
||||
self.image_proj_model = ImageProjModel(
|
||||
cross_attention_dim=self.cross_attention_dim,
|
||||
clip_embeddings_dim=clip_embeddings_dim,
|
||||
clip_extra_context_tokens=self.clip_extra_context_tokens
|
||||
)
|
||||
|
||||
self.load_ip_adapter(state_dict)
|
||||
|
||||
def load_ip_adapter(self, state_dict):
|
||||
self.image_proj_model.load_state_dict(state_dict["image_proj"])
|
||||
self.ip_layers = To_KV(self.cross_attention_dim)
|
||||
self.ip_layers.load_state_dict(state_dict["ip_adapter"])
|
||||
|
||||
@torch.inference_mode()
|
||||
def get_image_embeds(self, clip_vision_output):
|
||||
self.image_proj_model.cpu()
|
||||
|
||||
if self.is_plus:
|
||||
cond = self.image_proj_model(clip_vision_output['hidden_states'][-2].to(device='cpu', dtype=torch.float32))
|
||||
uncond = self.image_proj_model(torch.zeros_like(clip_vision_output['hidden_states'][-2].to(device='cpu', dtype=torch.float32)))
|
||||
return cond, uncond
|
||||
|
||||
clip_image_embeds = clip_vision_output['image_embeds'].to(device='cpu', dtype=torch.float32)
|
||||
image_prompt_embeds = self.image_proj_model(clip_image_embeds)
|
||||
# use a zero vector as the unconditional image embedding.
|
||||
uncond_image_prompt_embeds = self.image_proj_model(torch.zeros_like(clip_image_embeds))
|
||||
return image_prompt_embeds, uncond_image_prompt_embeds
|
||||
|
||||
|
||||
def get_block(model, flag):
|
||||
return {
|
||||
'input': model.input_blocks, 'middle': [model.middle_block], 'output': model.output_blocks
|
||||
}[flag]
|
||||
|
||||
|
||||
def attn_forward_hacked(self, x, context=None, **kwargs):
|
||||
batch_size, sequence_length, inner_dim = x.shape
|
||||
h = self.heads
|
||||
head_dim = inner_dim // h
|
||||
|
||||
if context is None:
|
||||
context = x
|
||||
|
||||
q = self.to_q(x)
|
||||
k = self.to_k(context)
|
||||
v = self.to_v(context)
|
||||
|
||||
del context
|
||||
|
||||
q, k, v = map(
|
||||
lambda t: t.view(batch_size, -1, h, head_dim).transpose(1, 2),
|
||||
(q, k, v),
|
||||
)
|
||||
|
||||
out = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
||||
out = out.transpose(1, 2).reshape(batch_size, -1, h * head_dim)
|
||||
|
||||
del k, v
|
||||
|
||||
for f in self.ipadapter_hacks:
|
||||
out = out + f(self, x, q)
|
||||
|
||||
del q, x
|
||||
|
||||
return self.to_out(out)
|
||||
|
||||
|
||||
all_hacks = {}
|
||||
current_model = None
|
||||
|
||||
|
||||
def hack_blk(block, function, type):
|
||||
if not hasattr(block, 'ipadapter_hacks'):
|
||||
block.ipadapter_hacks = []
|
||||
|
||||
if len(block.ipadapter_hacks) == 0:
|
||||
all_hacks[block] = block.forward
|
||||
block.forward = attn_forward_hacked.__get__(block, type)
|
||||
|
||||
block.ipadapter_hacks.append(function)
|
||||
return
|
||||
|
||||
|
||||
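hack_blk swaps a block's forward by binding a plain function with function.__get__(block, type) and stashes the original in all_hacks so clear_all_ip_adapter can restore it. The idiom in isolation, as a sketch with a toy class:

class Toy:
    def forward(self):
        return 'original'

def hacked_forward(self):
    return 'hacked'

t = Toy()
saved = t.forward                            # stash, as all_hacks does
t.forward = hacked_forward.__get__(t, Toy)   # bound method: `self` is the block
assert t.forward() == 'hacked'
t.forward = saved                            # restore, as clear_all_ip_adapter does
assert t.forward() == 'original'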
def set_model_attn2_replace(model, function, flag, id):
|
||||
from ldm.modules.attention import CrossAttention
|
||||
block = get_block(model, flag)[id][1].transformer_blocks[0].attn2
|
||||
hack_blk(block, function, CrossAttention)
|
||||
return
|
||||
|
||||
|
||||
def set_model_patch_replace(model, function, flag, id, trans_id):
|
||||
from sgm.modules.attention import CrossAttention
|
||||
blk = get_block(model, flag)
|
||||
block = blk[id][1].transformer_blocks[trans_id].attn2
|
||||
hack_blk(block, function, CrossAttention)
|
||||
return
|
||||
|
||||
|
||||
def clear_all_ip_adapter():
|
||||
global all_hacks, current_model
|
||||
for k, v in all_hacks.items():
|
||||
k.forward = v
|
||||
k.ipadapter_hacks = []
|
||||
all_hacks = {}
|
||||
current_model = None
|
||||
return
|
||||
|
||||
|
||||
class PlugableIPAdapter(torch.nn.Module):
|
||||
def __init__(self, state_dict, clip_embeddings_dim, is_plus):
|
||||
super().__init__()
|
||||
self.sdxl = clip_embeddings_dim == 1280 and not is_plus
|
||||
self.is_plus = is_plus
|
||||
self.ipadapter = IPAdapterModel(state_dict, clip_embeddings_dim=clip_embeddings_dim, is_plus=is_plus)
|
||||
self.disable_memory_management = True
|
||||
self.dtype = None
|
||||
self.weight = 1.0
|
||||
self.cache = {}
|
||||
self.p_start = 0.0
|
||||
self.p_end = 1.0
|
||||
return
|
||||
|
||||
def reset(self):
|
||||
self.cache = {}
|
||||
return
|
||||
|
||||
@torch.no_grad()
|
||||
def hook(self, model, clip_vision_output, weight, start, end, dtype=torch.float32):
|
||||
global current_model
|
||||
current_model = model
|
||||
|
||||
self.p_start = start
|
||||
self.p_end = end
|
||||
|
||||
self.cache = {}
|
||||
|
||||
self.weight = weight
|
||||
device = torch.device('cpu')
|
||||
self.dtype = dtype
|
||||
|
||||
self.ipadapter.to(device, dtype=self.dtype)
|
||||
self.image_emb, self.uncond_image_emb = self.ipadapter.get_image_embeds(clip_vision_output)
|
||||
|
||||
self.image_emb = self.image_emb.to(device, dtype=self.dtype)
|
||||
self.uncond_image_emb = self.uncond_image_emb.to(device, dtype=self.dtype)
|
||||
|
||||
# From https://github.com/laksjdjf/IPAdapter-ComfyUI
|
||||
if not self.sdxl:
|
||||
number = 0 # index of to_kvs
|
||||
for id in [1, 2, 4, 5, 7, 8]: # id of input_blocks that have cross attention
|
||||
set_model_attn2_replace(model, self.patch_forward(number), "input", id)
|
||||
number += 1
|
||||
for id in [3, 4, 5, 6, 7, 8, 9, 10, 11]: # id of output_blocks that have cross attention
|
||||
set_model_attn2_replace(model, self.patch_forward(number), "output", id)
|
||||
number += 1
|
||||
set_model_attn2_replace(model, self.patch_forward(number), "middle", 0)
|
||||
else:
|
||||
number = 0
|
||||
for id in [4, 5, 7, 8]: # id of input_blocks that have cross attention
|
||||
block_indices = range(2) if id in [4, 5] else range(10) # transformer_depth
|
||||
for index in block_indices:
|
||||
set_model_patch_replace(model, self.patch_forward(number), "input", id, index)
|
||||
number += 1
|
||||
for id in range(6): # id of output_blocks that have cross attention
|
||||
block_indices = range(2) if id in [3, 4, 5] else range(10) # transformer_depth
|
||||
for index in block_indices:
|
||||
set_model_patch_replace(model, self.patch_forward(number), "output", id, index)
|
||||
number += 1
|
||||
for index in range(10):
|
||||
set_model_patch_replace(model, self.patch_forward(number), "middle", 0, index)
|
||||
number += 1
|
||||
|
||||
return
|
||||
|
||||
def call_ip(self, number, feat, device):
|
||||
if number in self.cache:
|
||||
return self.cache[number]
|
||||
else:
|
||||
ip = self.ipadapter.ip_layers.to_kvs[number](feat).to(device)
|
||||
self.cache[number] = ip
|
||||
return ip
|
||||
|
||||
@torch.no_grad()
|
||||
def patch_forward(self, number):
|
||||
@torch.no_grad()
|
||||
def forward(attn_blk, x, q):
|
||||
batch_size, sequence_length, inner_dim = x.shape
|
||||
h = attn_blk.heads
|
||||
head_dim = inner_dim // h
|
||||
|
||||
current_sampling_percent = getattr(current_model, 'current_sampling_percent', 0.5)
|
||||
if current_sampling_percent < self.p_start or current_sampling_percent > self.p_end:
|
||||
return 0
|
||||
|
||||
cond_mark = current_model.cond_mark[:, :, :, 0].to(self.image_emb)
|
||||
cond_uncond_image_emb = self.image_emb * cond_mark + self.uncond_image_emb * (1 - cond_mark)
|
||||
ip_k = self.call_ip(number * 2, cond_uncond_image_emb, device=q.device)
|
||||
ip_v = self.call_ip(number * 2 + 1, cond_uncond_image_emb, device=q.device)
|
||||
|
||||
ip_k, ip_v = map(
|
||||
lambda t: t.view(batch_size, -1, h, head_dim).transpose(1, 2),
|
||||
(ip_k, ip_v),
|
||||
)
|
||||
|
||||
ip_out = torch.nn.functional.scaled_dot_product_attention(q, ip_k, ip_v, attn_mask=None, dropout_p=0.0, is_causal=False)
|
||||
ip_out = ip_out.transpose(1, 2).reshape(batch_size, -1, h * head_dim)
|
||||
|
||||
return ip_out * self.weight
|
||||
return forward
|
||||
|
|
@ -12,12 +12,13 @@ import gradio as gr
|
|||
|
||||
from einops import rearrange
|
||||
from scripts import global_state, hook, external_code, processor, batch_hijack, controlnet_version, utils
|
||||
from scripts.controlnet_ui import controlnet_ui_group
|
||||
from scripts.cldm import PlugableControlModel
|
||||
from scripts.controlnet_lora import bind_control_lora, unbind_control_lora
|
||||
from scripts.processor import *
|
||||
from scripts.adapter import PlugableAdapter
|
||||
from scripts.adapter import Adapter, StyleAdapter, Adapter_light
|
||||
from scripts.controlnet_lllite import PlugableControlLLLite, clear_all_lllite
|
||||
from scripts.controlmodel_ipadapter import PlugableIPAdapter, clear_all_ip_adapter
|
||||
from scripts.utils import load_state_dict, get_unique_axis0
|
||||
from scripts.hook import ControlParams, UnetHook, ControlModelType
|
||||
from scripts.hook import ControlParams, UnetHook, ControlModelType, HackedImageRNG
|
||||
from scripts.controlnet_ui.controlnet_ui_group import ControlNetUiGroup, UiControlNetUnit
|
||||
from scripts.logging import logger
|
||||
from modules.processing import StableDiffusionProcessingImg2Img, StableDiffusionProcessingTxt2Img
|
||||
|
|
@ -32,6 +33,8 @@ from pathlib import Path
|
|||
from PIL import Image, ImageFilter, ImageOps
|
||||
from scripts.lvminthin import lvmin_thin, nake_nms
|
||||
from scripts.processor import model_free_preprocessors
|
||||
from scripts.controlnet_model_guess import build_model_by_guess
|
||||
|
||||
|
||||
gradio_compat = True
|
||||
try:
|
||||
|
|
@ -49,6 +52,11 @@ gradio_tempfile_path = os.path.join(tempfile.gettempdir(), 'gradio')
|
|||
os.makedirs(gradio_tempfile_path, exist_ok=True)
|
||||
|
||||
|
||||
def clear_all_secondary_control_models():
|
||||
clear_all_lllite()
|
||||
clear_all_ip_adapter()
|
||||
|
||||
|
||||
def find_closest_lora_model_name(search: str):
|
||||
if not search:
|
||||
return None
|
||||
|
|
@ -228,6 +236,7 @@ class Script(scripts.Script, metaclass=(
|
|||
self.enabled_units = []
|
||||
self.detected_map = []
|
||||
self.post_processors = []
|
||||
self.noise_modifier = None
|
||||
batch_hijack.instance.process_batch_callbacks.append(self.batch_tab_process)
|
||||
batch_hijack.instance.process_batch_each_callbacks.append(self.batch_tab_process_each)
|
||||
batch_hijack.instance.postprocess_batch_each_callbacks.insert(0, self.batch_tab_postprocess_each)
|
||||
|
|
@ -266,7 +275,7 @@ class Script(scripts.Script, metaclass=(
|
|||
infotext = Infotext()
|
||||
|
||||
controls = ()
|
||||
max_models = shared.opts.data.get("control_net_max_models_num", 1)
|
||||
max_models = shared.opts.data.get("control_net_unit_count", 3)
|
||||
elem_id_tabname = ("img2img" if is_img2img else "txt2img") + "_controlnet"
|
||||
with gr.Group(elem_id=elem_id_tabname):
|
||||
with gr.Accordion(f"ControlNet {controlnet_version.version_flag}", open = False, elem_id="controlnet"):
|
||||
|
|
@ -297,7 +306,7 @@ class Script(scripts.Script, metaclass=(
|
|||
devices.torch_gc()
|
||||
|
||||
@staticmethod
|
||||
def load_control_model(p, unet, model, lowvram):
|
||||
def load_control_model(p, unet, model):
|
||||
if model in Script.model_cache:
|
||||
logger.info(f"Loading model from cache: {model}")
|
||||
return Script.model_cache[model]
|
||||
|
|
@ -308,7 +317,7 @@ class Script(scripts.Script, metaclass=(
|
|||
gc.collect()
|
||||
devices.torch_gc()
|
||||
|
||||
model_net = Script.build_control_model(p, unet, model, lowvram)
|
||||
model_net = Script.build_control_model(p, unet, model)
|
||||
|
||||
if shared.opts.data.get("control_net_model_cache_size", 2) > 0:
|
||||
Script.model_cache[model] = model_net
|
||||
|
|
@ -316,7 +325,7 @@ class Script(scripts.Script, metaclass=(
|
|||
return model_net
|
||||
|
||||
@staticmethod
|
||||
def build_control_model(p, unet, model, lowvram):
|
||||
def build_control_model(p, unet, model):
|
||||
if model is None or model == 'None':
|
||||
raise RuntimeError(f"You have not selected any ControlNet Model.")
|
||||
|
||||
|
|
@ -337,67 +346,8 @@ class Script(scripts.Script, metaclass=(
|
|||
|
||||
logger.info(f"Loading model: {model}")
|
||||
state_dict = load_state_dict(model_path)
|
||||
network_module = PlugableControlModel
|
||||
network_config = shared.opts.data.get("control_net_model_config", global_state.default_conf)
|
||||
if not os.path.isabs(network_config):
|
||||
network_config = os.path.join(global_state.script_dir, network_config)
|
||||
|
||||
if any([k.startswith("body.") or k == 'style_embedding' for k, v in state_dict.items()]):
|
||||
# adapter model
|
||||
network_module = PlugableAdapter
|
||||
network_config = shared.opts.data.get("control_net_model_adapter_config", global_state.default_conf_adapter)
|
||||
if not os.path.isabs(network_config):
|
||||
network_config = os.path.join(global_state.script_dir, network_config)
|
||||
|
||||
model_path = os.path.abspath(model_path)
|
||||
model_stem = Path(model_path).stem
|
||||
model_dir_name = os.path.dirname(model_path)
|
||||
|
||||
possible_config_filenames = [
|
||||
os.path.join(model_dir_name, model_stem + ".yaml"),
|
||||
os.path.join(global_state.script_dir, 'models', model_stem + ".yaml"),
|
||||
os.path.join(model_dir_name, model_stem.replace('_fp16', '') + ".yaml"),
|
||||
os.path.join(global_state.script_dir, 'models', model_stem.replace('_fp16', '') + ".yaml"),
|
||||
os.path.join(model_dir_name, model_stem.replace('_diff', '') + ".yaml"),
|
||||
os.path.join(global_state.script_dir, 'models', model_stem.replace('_diff', '') + ".yaml"),
|
||||
os.path.join(model_dir_name, model_stem.replace('-fp16', '') + ".yaml"),
|
||||
os.path.join(global_state.script_dir, 'models', model_stem.replace('-fp16', '') + ".yaml"),
|
||||
os.path.join(model_dir_name, model_stem.replace('-diff', '') + ".yaml"),
|
||||
os.path.join(global_state.script_dir, 'models', model_stem.replace('-diff', '') + ".yaml")
|
||||
]
|
||||
|
||||
override_config = possible_config_filenames[0]
|
||||
|
||||
for possible_config_filename in possible_config_filenames:
|
||||
if os.path.exists(possible_config_filename):
|
||||
override_config = possible_config_filename
|
||||
break
|
||||
|
||||
if 'v11' in model_stem.lower() or 'shuffle' in model_stem.lower():
|
||||
assert os.path.exists(override_config), f'Error: The model config {override_config} is missing. ControlNet 1.1 must have configs.'
|
||||
|
||||
if os.path.exists(override_config):
|
||||
network_config = override_config
|
||||
else:
|
||||
# Note: This error is triggered in unittest, but not caught.
|
||||
# TODO: Replace `print` with `logger.error`.
|
||||
print(f'ERROR: ControlNet cannot find model config [{override_config}] \n'
|
||||
f'ERROR: ControlNet will use a WRONG config [{network_config}] to load your model. \n'
|
||||
f'ERROR: The WRONG config may not match your model. The generated results can be bad. \n'
|
||||
f'ERROR: You are using a ControlNet model [{model_stem}] without correct YAML config file. \n'
|
||||
f'ERROR: The performance of this model may be worse than your expectation. \n'
|
||||
f'ERROR: If this model cannot get good results, the reason is that you do not have a YAML file for the model. \n'
|
||||
f'Solution: Please download YAML file, or ask your model provider to provide [{override_config}] for you to download.\n'
|
||||
f'Hint: You can take a look at [{os.path.join(global_state.script_dir, "models")}] to find many existing YAML files.\n')
|
||||
|
||||
logger.info(f"Loading config: {network_config}")
|
||||
network = network_module(
|
||||
state_dict=state_dict,
|
||||
config_path=network_config,
|
||||
lowvram=lowvram,
|
||||
base_model=unet,
|
||||
)
|
||||
network.to(p.sd_model.device, dtype=p.sd_model.dtype)
|
||||
network = build_model_by_guess(state_dict, unet, model_path)
|
||||
network.to('cpu', dtype=p.sd_model.dtype)
|
||||
logger.info(f"ControlNet model {model} loaded.")
|
||||
return network
|
||||
|
||||
|
|
@ -482,41 +432,44 @@ class Script(scripts.Script, metaclass=(
|
|||
inpaint_mask = x[:, :, 3]
|
||||
x = x[:, :, 0:3]
|
||||
|
||||
new_size_is_smaller = (size[0] * size[1]) < (x.shape[0] * x.shape[1])
|
||||
new_size_is_bigger = (size[0] * size[1]) > (x.shape[0] * x.shape[1])
|
||||
unique_color_count = len(get_unique_axis0(x.reshape(-1, x.shape[2])))
|
||||
is_one_pixel_edge = False
|
||||
is_binary = False
|
||||
if unique_color_count == 2:
|
||||
is_binary = np.min(x) < 16 and np.max(x) > 240
|
||||
if is_binary:
|
||||
xc = x
|
||||
xc = cv2.erode(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
|
||||
xc = cv2.dilate(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
|
||||
one_pixel_edge_count = np.where(xc < x)[0].shape[0]
|
||||
all_edge_count = np.where(x > 127)[0].shape[0]
|
||||
is_one_pixel_edge = one_pixel_edge_count * 2 > all_edge_count
|
||||
if x.shape[0] != size[1] or x.shape[1] != size[0]:
|
||||
new_size_is_smaller = (size[0] * size[1]) < (x.shape[0] * x.shape[1])
|
||||
new_size_is_bigger = (size[0] * size[1]) > (x.shape[0] * x.shape[1])
|
||||
unique_color_count = len(get_unique_axis0(x.reshape(-1, x.shape[2])))
|
||||
is_one_pixel_edge = False
|
||||
is_binary = False
|
||||
if unique_color_count == 2:
|
||||
is_binary = np.min(x) < 16 and np.max(x) > 240
|
||||
if is_binary:
|
||||
xc = x
|
||||
xc = cv2.erode(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
|
||||
xc = cv2.dilate(xc, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
|
||||
one_pixel_edge_count = np.where(xc < x)[0].shape[0]
|
||||
all_edge_count = np.where(x > 127)[0].shape[0]
|
||||
is_one_pixel_edge = one_pixel_edge_count * 2 > all_edge_count
|
||||
|
||||
if 2 < unique_color_count < 200:
|
||||
interpolation = cv2.INTER_NEAREST
|
||||
elif new_size_is_smaller:
|
||||
interpolation = cv2.INTER_AREA
|
||||
else:
|
||||
interpolation = cv2.INTER_CUBIC # Must be CUBIC because we now use nms. NEVER CHANGE THIS
|
||||
|
||||
y = cv2.resize(x, size, interpolation=interpolation)
|
||||
if inpaint_mask is not None:
|
||||
inpaint_mask = cv2.resize(inpaint_mask, size, interpolation=interpolation)
|
||||
|
||||
if is_binary:
|
||||
y = np.mean(y.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8)
|
||||
if is_one_pixel_edge:
|
||||
y = nake_nms(y)
|
||||
_, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
y = lvmin_thin(y, prunings=new_size_is_bigger)
|
||||
if 2 < unique_color_count < 200:
|
||||
interpolation = cv2.INTER_NEAREST
|
||||
elif new_size_is_smaller:
|
||||
interpolation = cv2.INTER_AREA
|
||||
else:
|
||||
_, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
y = np.stack([y] * 3, axis=2)
|
||||
interpolation = cv2.INTER_CUBIC # Must be CUBIC because we now use nms. NEVER CHANGE THIS
|
||||
|
||||
y = cv2.resize(x, size, interpolation=interpolation)
|
||||
if inpaint_mask is not None:
|
||||
inpaint_mask = cv2.resize(inpaint_mask, size, interpolation=interpolation)
|
||||
|
||||
if is_binary:
|
||||
y = np.mean(y.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8)
|
||||
if is_one_pixel_edge:
|
||||
y = nake_nms(y)
|
||||
_, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
y = lvmin_thin(y, prunings=new_size_is_bigger)
|
||||
else:
|
||||
_, y = cv2.threshold(y, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
|
||||
y = np.stack([y] * 3, axis=2)
|
||||
else:
|
||||
y = x
|
||||
|
||||
if inpaint_mask is not None:
|
||||
inpaint_mask = (inpaint_mask > 127).astype(np.float32) * 255.0
|
||||
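The resize logic above derives the interpolation from the content: a map with only a handful of unique colors is treated as a palette or segmentation map (NEAREST, so no new colors are invented), while continuous images use AREA when shrinking and CUBIC when enlarging. The decision distilled into a sketch helper:

import cv2

def pick_interpolation(unique_color_count, shrinking):
    if 2 < unique_color_count < 200:
        return cv2.INTER_NEAREST   # palette-like map: never blend colors
    if shrinking:
        return cv2.INTER_AREA      # best for downscaling
    return cv2.INTER_CUBIC         # upscaling; required by the nms path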
|
|
@ -691,12 +644,16 @@ class Script(scripts.Script, metaclass=(
|
|||
"""
|
||||
sd_ldm = p.sd_model
|
||||
unet = sd_ldm.model.diffusion_model
|
||||
self.noise_modifier = None
|
||||
|
||||
setattr(p, 'controlnet_initial_noise_modifier', None)
|
||||
setattr(p, 'controlnet_control_loras', [])
|
||||
|
||||
if self.latest_network is not None:
|
||||
# always restore (~0.05s)
|
||||
self.latest_network.restore(unet)
|
||||
self.latest_network.restore()
|
||||
|
||||
# always clear (~0.05s)
|
||||
clear_all_secondary_control_models()
|
||||
|
||||
if not batch_hijack.instance.is_batch:
|
||||
self.enabled_units = Script.get_enabled_units(p)
|
||||
|
|
@ -704,11 +661,6 @@ class Script(scripts.Script, metaclass=(
|
|||
if len(self.enabled_units) == 0:
|
||||
self.latest_network = None
|
||||
return
|
||||
|
||||
is_sdxl = getattr(p.sd_model, 'is_sdxl', False)
|
||||
if is_sdxl:
|
||||
logger.warning('ControlNet does not support SDXL -- disabling')
|
||||
return
|
||||
|
||||
detected_maps = []
|
||||
forward_params = []
|
||||
|
|
@ -718,6 +670,9 @@ class Script(scripts.Script, metaclass=(
|
|||
if self.latest_model_hash != p.sd_model.sd_model_hash:
|
||||
Script.clear_control_model_cache()
|
||||
|
||||
for idx, unit in enumerate(self.enabled_units):
|
||||
unit.module = global_state.get_module_basename(unit.module)
|
||||
|
||||
# unload unused preproc
|
||||
module_list = [unit.module for unit in self.enabled_units]
|
||||
for key in self.unloadable:
|
||||
|
|
@ -728,16 +683,20 @@ class Script(scripts.Script, metaclass=(
|
|||
for idx, unit in enumerate(self.enabled_units):
|
||||
Script.bound_check_params(unit)
|
||||
|
||||
unit.module = global_state.get_module_basename(unit.module)
|
||||
resize_mode = external_code.resize_mode_from_value(unit.resize_mode)
|
||||
control_mode = external_code.control_mode_from_value(unit.control_mode)
|
||||
|
||||
if unit.module in model_free_preprocessors:
|
||||
model_net = None
|
||||
else:
|
||||
model_net = Script.load_control_model(p, unet, unit.model, unit.low_vram)
|
||||
model_net = Script.load_control_model(p, unet, unit.model)
|
||||
model_net.reset()
|
||||
|
||||
if getattr(model_net, 'is_control_lora', False):
|
||||
control_lora = model_net.control_model
|
||||
bind_control_lora(unet, control_lora)
|
||||
p.controlnet_control_loras.append(control_lora)
|
||||
|
||||
input_image, image_from_a1111 = Script.choose_input_image(p, unit, idx)
|
||||
if image_from_a1111:
|
||||
a1111_i2i_resize_mode = getattr(p, "resize_mode", None)
|
||||
|
|
@ -814,6 +773,25 @@ class Script(scripts.Script, metaclass=(
|
|||
# inpaint_only+lama is special and requires an outpaint fix
|
||||
_, input_image = Script.detectmap_proc(input_image, unit.module, resize_mode, hr_y, hr_x)
|
||||
|
||||
control_model_type = ControlModelType.ControlNet
|
||||
global_average_pooling = False
|
||||
|
||||
if 'reference' in unit.module:
|
||||
control_model_type = ControlModelType.AttentionInjection
|
||||
elif 'revision' in unit.module:
|
||||
control_model_type = ControlModelType.ReVision
|
||||
elif hasattr(model_net, 'control_model') and (isinstance(model_net.control_model, Adapter) or isinstance(model_net.control_model, Adapter_light)):
|
||||
control_model_type = ControlModelType.T2I_Adapter
|
||||
elif hasattr(model_net, 'control_model') and isinstance(model_net.control_model, StyleAdapter):
|
||||
control_model_type = ControlModelType.T2I_StyleAdapter
|
||||
elif isinstance(model_net, PlugableIPAdapter):
|
||||
control_model_type = ControlModelType.IPAdapter
|
||||
elif isinstance(model_net, PlugableControlLLLite):
|
||||
control_model_type = ControlModelType.Controlllite
|
||||
|
||||
if control_model_type is ControlModelType.ControlNet:
|
||||
global_average_pooling = model_net.control_model.global_average_pooling
|
||||
|
||||
preprocessor_resolution = unit.processor_res
|
||||
if unit.pixel_perfect:
|
||||
preprocessor_resolution = external_code.pixel_perfect_resolution(
|
||||
|
|
@ -838,12 +816,6 @@ class Script(scripts.Script, metaclass=(
|
|||
thr_b=unit.threshold_b,
|
||||
)
|
||||
|
||||
if unit.module == "none" and "style" in unit.model:
|
||||
detected_map_bytes = detected_map[:,:,0].tobytes()
|
||||
detected_map = np.ndarray((round(input_image.shape[0]/4),input_image.shape[1]),dtype="float32",buffer=detected_map_bytes)
|
||||
detected_map = torch.Tensor(detected_map).to(devices.get_device_for("controlnet"))
|
||||
is_image = False
|
||||
|
||||
if high_res_fix:
|
||||
if is_image:
|
||||
hr_control, hr_detected_map = Script.detectmap_proc(detected_map, unit.module, resize_mode, hr_y, hr_x)
|
||||
|
|
@ -858,25 +830,13 @@ class Script(scripts.Script, metaclass=(
|
|||
detected_maps.append((detected_map, unit.module))
|
||||
else:
|
||||
control = detected_map
|
||||
if unit.module == 'clip_vision':
|
||||
detected_maps.append((processor.clip_vision_visualization(detected_map), unit.module))
|
||||
detected_maps.append((input_image, unit.module))
|
||||
|
||||
control_model_type = ControlModelType.ControlNet
|
||||
if control_model_type == ControlModelType.T2I_StyleAdapter:
|
||||
control = control['last_hidden_state']
|
||||
|
||||
if isinstance(model_net, PlugableAdapter):
|
||||
control_model_type = ControlModelType.T2I_Adapter
|
||||
|
||||
if getattr(model_net, "target", None) == "scripts.adapter.StyleAdapter":
|
||||
control_model_type = ControlModelType.T2I_StyleAdapter
|
||||
|
||||
if 'reference' in unit.module:
|
||||
control_model_type = ControlModelType.AttentionInjection
|
||||
|
||||
global_average_pooling = False
|
||||
|
||||
if model_net is not None:
|
||||
if model_net.config.model.params.get("global_average_pooling", False):
|
||||
global_average_pooling = True
|
||||
if control_model_type == ControlModelType.ReVision:
|
||||
control = control['image_embeds']
|
||||
|
||||
preprocessor_dict = dict(
|
||||
name=unit.module,
|
||||
|
|
@ -928,16 +888,100 @@ class Script(scripts.Script, metaclass=(
|
|||
|
||||
post_processors.append(inpaint_only_post_processing)
|
||||
|
||||
if 'recolor' in unit.module:
|
||||
final_feed = hr_control if hr_control is not None else control
|
||||
final_feed = final_feed.detach().cpu().numpy()
|
||||
final_feed = np.ascontiguousarray(final_feed).copy()
|
||||
final_feed = final_feed[0, 0, :, :].astype(np.float32)
|
||||
final_feed = (final_feed * 255).clip(0, 255).astype(np.uint8)
|
||||
Hfeed, Wfeed = final_feed.shape
|
||||
|
||||
if 'luminance' in unit.module:
|
||||
|
||||
def recolor_luminance_post_processing(x):
|
||||
C, H, W = x.shape
|
||||
if Hfeed != H or Wfeed != W or C != 3:
|
||||
logger.error('Error: ControlNet found a post-processing resolution mismatch. This could be related to other extensions hacking the processing.')
|
||||
return x
|
||||
h = x.detach().cpu().numpy().transpose((1, 2, 0))
|
||||
h = (h * 255).clip(0, 255).astype(np.uint8)
|
||||
h = cv2.cvtColor(h, cv2.COLOR_RGB2LAB)
|
||||
h[:, :, 0] = final_feed
|
||||
h = cv2.cvtColor(h, cv2.COLOR_LAB2RGB)
|
||||
h = (h.astype(np.float32) / 255.0).transpose((2, 0, 1))
|
||||
y = torch.from_numpy(h).clip(0, 1).to(x)
|
||||
return y
|
||||
|
||||
post_processors.append(recolor_luminance_post_processing)
|
||||
|
||||
if 'intensity' in unit.module:
|
||||
|
||||
def recolor_intensity_post_processing(x):
|
||||
C, H, W = x.shape
|
||||
if Hfeed != H or Wfeed != W or C != 3:
|
||||
logger.error('Error: ControlNet found a post-processing resolution mismatch. This could be related to other extensions hacking the processing.')
|
||||
return x
|
||||
h = x.detach().cpu().numpy().transpose((1, 2, 0))
|
||||
h = (h * 255).clip(0, 255).astype(np.uint8)
|
||||
h = cv2.cvtColor(h, cv2.COLOR_RGB2HSV)
|
||||
h[:, :, 2] = final_feed
|
||||
h = cv2.cvtColor(h, cv2.COLOR_HSV2RGB)
|
||||
h = (h.astype(np.float32) / 255.0).transpose((2, 0, 1))
|
||||
y = torch.from_numpy(h).clip(0, 1).to(x)
|
||||
return y
|
||||
|
||||
post_processors.append(recolor_intensity_post_processing)
|
||||
|
||||
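The two recolor post-processors are the same operation in different color spaces: luminance overwrites the L channel in LAB, intensity overwrites the V channel in HSV, and the rest of the generated image is kept. A side-by-side sketch with random data:

import cv2
import numpy as np

rgb = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)    # generated image
feed = np.random.randint(0, 256, (4, 4), dtype=np.uint8)      # final_feed

lab = cv2.cvtColor(rgb, cv2.COLOR_RGB2LAB); lab[:, :, 0] = feed
luminance_result = cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)       # 'luminance' module

hsv = cv2.cvtColor(rgb, cv2.COLOR_RGB2HSV); hsv[:, :, 2] = feed
intensity_result = cv2.cvtColor(hsv, cv2.COLOR_HSV2RGB)       # 'intensity' module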
if '+lama' in unit.module:
|
||||
forward_param.used_hint_cond_latent = hook.UnetHook.call_vae_using_process(p, control)
|
||||
setattr(p, 'controlnet_initial_noise_modifier', forward_param.used_hint_cond_latent)
|
||||
self.noise_modifier = forward_param.used_hint_cond_latent
|
||||
|
||||
del model_net
|
||||
|
||||
self.latest_network = UnetHook(lowvram=any(unit.low_vram for unit in self.enabled_units))
|
||||
is_low_vram = any(unit.low_vram for unit in self.enabled_units)
|
||||
|
||||
self.latest_network = UnetHook(lowvram=is_low_vram)
|
||||
self.latest_network.hook(model=unet, sd_ldm=sd_ldm, control_params=forward_params, process=p)
|
||||
|
||||
revision_conds = 0
|
||||
revision_conds_weight = 0
|
||||
for param in forward_params:
|
||||
if param.control_model_type == ControlModelType.ReVision:
|
||||
revision_conds = revision_conds + param.hint_cond * param.weight
|
||||
revision_conds_weight += param.weight
|
||||
revision_conds_weight = max(revision_conds_weight, 1e-3)
|
||||
self.latest_network.global_revision = revision_conds / revision_conds_weight
|
||||
|
||||
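The ReVision blend above is a weight-normalized sum; clamping the divisor at 1e-3 only guards against an all-zero total, so a single unit at, say, weight 0.5 still contributes its full conditioning. The arithmetic with scalars:

# global_revision = sum(w_i * c_i) / max(sum(w_i), 1e-3)
conds, weights = [2.0, 4.0], [0.5, 0.25]
blend = sum(w * c for c, w in zip(conds, weights)) / max(sum(weights), 1e-3)
assert abs(blend - 2.0 / 0.75) < 1e-9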
for param in forward_params:
|
||||
if param.control_model_type == ControlModelType.IPAdapter:
|
||||
param.control_model.hook(
|
||||
model=unet,
|
||||
clip_vision_output=param.hint_cond,
|
||||
weight=param.weight,
|
||||
dtype=torch.float32,
|
||||
start=param.start_guidance_percent,
|
||||
end=param.stop_guidance_percent
|
||||
)
|
||||
if param.control_model_type == ControlModelType.Controlllite:
|
||||
param.control_model.hook(
|
||||
model=unet,
|
||||
cond=param.hint_cond,
|
||||
weight=param.weight,
|
||||
start=param.start_guidance_percent,
|
||||
end=param.stop_guidance_percent
|
||||
)
|
||||
|
||||
self.detected_map = detected_maps
|
||||
self.post_processors = post_processors
|
||||
|
||||
def before_process_batch(self, p, *args, **kwargs):
|
||||
if self.noise_modifier is not None:
|
||||
p.rng = HackedImageRNG(rng=p.rng,
|
||||
noise_modifier=self.noise_modifier,
|
||||
sd_model=p.sd_model)
|
||||
self.noise_modifier = None
|
||||
return
|
||||
|
||||
def postprocess_batch(self, p, *args, **kwargs):
|
||||
images = kwargs.get('images', [])
|
||||
for post_processor in self.post_processors:
|
||||
|
|
@ -946,8 +990,15 @@ class Script(scripts.Script, metaclass=(
|
|||
return
|
||||
|
||||
def postprocess(self, p, processed, *args):
|
||||
clear_all_secondary_control_models()
|
||||
|
||||
self.noise_modifier = None
|
||||
|
||||
for control_lora in getattr(p, 'controlnet_control_loras', []):
|
||||
unbind_control_lora(control_lora)
|
||||
p.controlnet_control_loras = []
|
||||
|
||||
self.post_processors = []
|
||||
setattr(p, 'controlnet_initial_noise_modifier', None)
|
||||
setattr(p, 'controlnet_vae_cache', None)
|
||||
|
||||
processor_params_flag = (', '.join(getattr(processed, 'extra_generation_params', []))).lower()
|
||||
|
|
@ -985,7 +1036,7 @@ class Script(scripts.Script, metaclass=(
|
|||
])
|
||||
|
||||
self.input_image = None
|
||||
self.latest_network.restore(p.sd_model.model.diffusion_model)
|
||||
self.latest_network.restore()
|
||||
self.latest_network = None
|
||||
self.detected_map.clear()
|
||||
|
||||
|
|
@ -1017,25 +1068,21 @@ class Script(scripts.Script, metaclass=(
|
|||
self.input_image = None
|
||||
if self.latest_network is None: return
|
||||
|
||||
self.latest_network.restore(shared.sd_model.model.diffusion_model)
|
||||
self.latest_network.restore()
|
||||
self.latest_network = None
|
||||
self.detected_map.clear()
|
||||
|
||||
|
||||
def on_ui_settings():
|
||||
section = ('control_net', "ControlNet")
|
||||
shared.opts.add_option("control_net_model_config", shared.OptionInfo(
|
||||
global_state.default_conf, "Config file for Control Net models", section=section))
|
||||
shared.opts.add_option("control_net_model_adapter_config", shared.OptionInfo(
|
||||
global_state.default_conf_adapter, "Config file for Adapter models", section=section))
|
||||
shared.opts.add_option("control_net_detectedmap_dir", shared.OptionInfo(
|
||||
global_state.default_detectedmap_dir, "Directory for detected maps auto saving", section=section))
|
||||
shared.opts.add_option("control_net_models_path", shared.OptionInfo(
|
||||
"", "Extra path to scan for ControlNet models (e.g. training output directory)", section=section))
|
||||
shared.opts.add_option("control_net_modules_path", shared.OptionInfo(
|
||||
"", "Path to directory containing annotator model directories (requires restart, overrides corresponding command line flag)", section=section))
|
||||
shared.opts.add_option("control_net_max_models_num", shared.OptionInfo(
|
||||
3, "Multi ControlNet: Max models amount (requires restart)", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}, section=section))
|
||||
shared.opts.add_option("control_net_unit_count", shared.OptionInfo(
|
||||
3, "Multi-ControlNet: ControlNet unit number (requires restart)", gr.Slider, {"minimum": 1, "maximum": 10, "step": 1}, section=section))
|
||||
shared.opts.add_option("control_net_model_cache_size", shared.OptionInfo(
|
||||
1, "Model cache size (requires restart)", gr.Slider, {"minimum": 1, "maximum": 5, "step": 1}, section=section))
|
||||
shared.opts.add_option("control_net_inpaint_blur_sigma", shared.OptionInfo(
|
||||
|
|
|
|||
|
|
@ -0,0 +1,95 @@
|
|||
# https://gist.github.com/takuma104/4adfb3d968d80bea1d18a30c06439242
|
||||
# 2nd edit by lllyasviel
|
||||
|
||||
import torch
|
||||
|
||||
|
||||
# =================#
|
||||
# UNet Conversion #
|
||||
# =================#
|
||||
|
||||
unet_conversion_map = [
|
||||
# (stable-diffusion, HF Diffusers)
|
||||
("time_embed.0.weight", "time_embedding.linear_1.weight"),
|
||||
("time_embed.0.bias", "time_embedding.linear_1.bias"),
|
||||
("time_embed.2.weight", "time_embedding.linear_2.weight"),
|
||||
("time_embed.2.bias", "time_embedding.linear_2.bias"),
|
||||
("label_emb.0.0.weight", "add_embedding.linear_1.weight"),
|
||||
("label_emb.0.0.bias", "add_embedding.linear_1.bias"),
|
||||
("label_emb.0.2.weight", "add_embedding.linear_2.weight"),
|
||||
("label_emb.0.2.bias", "add_embedding.linear_2.bias"),
|
||||
("input_blocks.0.0.weight", "conv_in.weight"),
|
||||
("input_blocks.0.0.bias", "conv_in.bias"),
|
||||
("middle_block_out.0.weight", "controlnet_mid_block.weight"),
|
||||
("middle_block_out.0.bias", "controlnet_mid_block.bias"),
|
||||
]
|
||||
|
||||
unet_conversion_map_resnet = [
|
||||
# (stable-diffusion, HF Diffusers)
|
||||
("in_layers.0", "norm1"),
|
||||
("in_layers.2", "conv1"),
|
||||
("out_layers.0", "norm2"),
|
||||
("out_layers.3", "conv2"),
|
||||
("emb_layers.1", "time_emb_proj"),
|
||||
("skip_connection", "conv_shortcut"),
|
||||
]
|
||||
|
||||
unet_conversion_map_layer = []
|
||||
# hardcoded number of downblocks and resnets/attentions...
|
||||
# would need smarter logic for other networks.
|
||||
for i in range(4):
|
||||
# loop over downblocks/upblocks
|
||||
|
||||
for j in range(10):
|
||||
# loop over resnets/attentions for downblocks
|
||||
hf_down_res_prefix = f"down_blocks.{i}.resnets.{j}."
|
||||
sd_down_res_prefix = f"input_blocks.{3*i + j + 1}.0."
|
||||
unet_conversion_map_layer.append((sd_down_res_prefix, hf_down_res_prefix))
|
||||
|
||||
hf_down_atn_prefix = f"down_blocks.{i}.attentions.{j}."
|
||||
sd_down_atn_prefix = f"input_blocks.{3 * i + j + 1}.1."
|
||||
unet_conversion_map_layer.append((sd_down_atn_prefix, hf_down_atn_prefix))
|
||||
|
||||
hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0.conv."
|
||||
sd_downsample_prefix = f"input_blocks.{3 * (i + 1)}.0.op."
|
||||
unet_conversion_map_layer.append((sd_downsample_prefix, hf_downsample_prefix))
|
||||
|
||||
|
||||
hf_mid_atn_prefix = "mid_block.attentions.0."
|
||||
sd_mid_atn_prefix = "middle_block.1."
|
||||
unet_conversion_map_layer.append((sd_mid_atn_prefix, hf_mid_atn_prefix))
|
||||
|
||||
for j in range(2):
|
||||
hf_mid_res_prefix = f"mid_block.resnets.{j}."
|
||||
sd_mid_res_prefix = f"middle_block.{2*j}."
|
||||
unet_conversion_map_layer.append((sd_mid_res_prefix, hf_mid_res_prefix))
|
||||
|
||||
# controlnet specific
|
||||
|
||||
controlnet_cond_embedding_names = ['conv_in'] + [f'blocks.{i}' for i in range(6)] + ['conv_out']
|
||||
for i, hf_prefix in enumerate(controlnet_cond_embedding_names):
|
||||
hf_prefix = f"controlnet_cond_embedding.{hf_prefix}."
|
||||
sd_prefix = f"input_hint_block.{i*2}."
|
||||
unet_conversion_map_layer.append((sd_prefix, hf_prefix))
|
||||
|
||||
for i in range(12):
|
||||
hf_prefix = f"controlnet_down_blocks.{i}."
|
||||
sd_prefix = f"zero_convs.{i}.0."
|
||||
unet_conversion_map_layer.append((sd_prefix, hf_prefix))
|
||||
|
||||
|
||||
def convert_from_diffuser_state_dict(unet_state_dict):
|
||||
mapping = {k: k for k in unet_state_dict.keys()}
|
||||
for sd_name, hf_name in unet_conversion_map:
|
||||
mapping[hf_name] = sd_name
|
||||
for k, v in mapping.items():
|
||||
if "resnets" in k:
|
||||
for sd_part, hf_part in unet_conversion_map_resnet:
|
||||
v = v.replace(hf_part, sd_part)
|
||||
mapping[k] = v
|
||||
for k, v in mapping.items():
|
||||
for sd_part, hf_part in unet_conversion_map_layer:
|
||||
v = v.replace(hf_part, sd_part)
|
||||
mapping[k] = v
|
||||
new_state_dict = {v: unet_state_dict[k] for k, v in mapping.items() if k in unet_state_dict}
|
||||
return new_state_dict
|
||||
|
|
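For reference, a minimal usage sketch of the converter above — not part of the commit itself; the checkpoint file name is an assumption:

import safetensors.torch

# Load a HF Diffusers-layout ControlNet and rename its tensors into the
# stable-diffusion layout that the rest of the extension expects.
diffusers_sd = safetensors.torch.load_file('diffusion_pytorch_model.safetensors', device='cpu')
sd_layout = convert_from_diffuser_state_dict(diffusers_sd)
print(len(sd_layout), 'tensors converted')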
@@ -0,0 +1,218 @@
# https://github.com/kohya-ss/ControlNet-LLLite-ComfyUI/blob/main/node_control_net_lllite.py

import re
import torch

from modules import devices


class LLLiteModule(torch.nn.Module):
    def __init__(
        self,
        name: str,
        is_conv2d: bool,
        in_dim: int,
        depth: int,
        cond_emb_dim: int,
        mlp_dim: int,
    ):
        super().__init__()
        self.name = name
        self.is_conv2d = is_conv2d
        self.is_first = False

        modules = []
        modules.append(torch.nn.Conv2d(3, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0))  # to latent (from VAE) size*2
        if depth == 1:
            modules.append(torch.nn.ReLU(inplace=True))
            modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))
        elif depth == 2:
            modules.append(torch.nn.ReLU(inplace=True))
            modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=4, stride=4, padding=0))
        elif depth == 3:
            # kernel size 8 is too large, so use 4 instead
            modules.append(torch.nn.ReLU(inplace=True))
            modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim // 2, kernel_size=4, stride=4, padding=0))
            modules.append(torch.nn.ReLU(inplace=True))
            modules.append(torch.nn.Conv2d(cond_emb_dim // 2, cond_emb_dim, kernel_size=2, stride=2, padding=0))

        self.conditioning1 = torch.nn.Sequential(*modules)

        if self.is_conv2d:
            self.down = torch.nn.Sequential(
                torch.nn.Conv2d(in_dim, mlp_dim, kernel_size=1, stride=1, padding=0),
                torch.nn.ReLU(inplace=True),
            )
            self.mid = torch.nn.Sequential(
                torch.nn.Conv2d(mlp_dim + cond_emb_dim, mlp_dim, kernel_size=1, stride=1, padding=0),
                torch.nn.ReLU(inplace=True),
            )
            self.up = torch.nn.Sequential(
                torch.nn.Conv2d(mlp_dim, in_dim, kernel_size=1, stride=1, padding=0),
            )
        else:
            self.down = torch.nn.Sequential(
                torch.nn.Linear(in_dim, mlp_dim),
                torch.nn.ReLU(inplace=True),
            )
            self.mid = torch.nn.Sequential(
                torch.nn.Linear(mlp_dim + cond_emb_dim, mlp_dim),
                torch.nn.ReLU(inplace=True),
            )
            self.up = torch.nn.Sequential(
                torch.nn.Linear(mlp_dim, in_dim),
            )

        self.depth = depth
        self.cond_image = None
        self.cond_emb = None

    def set_cond_image(self, cond_image):
        self.cond_image = cond_image
        self.cond_emb = None

    def forward(self, x):
        if self.cond_emb is None:
            # print(f"cond_emb is None, {self.name}")
            cx = self.conditioning1(self.cond_image.to(x.device, dtype=x.dtype))
            if not self.is_conv2d:
                # reshape / b,c,h,w -> b,h*w,c
                n, c, h, w = cx.shape
                cx = cx.view(n, c, h * w).permute(0, 2, 1)
            self.cond_emb = cx

        cx = self.cond_emb

        # x has twice the batch size because of uncond/cond
        if x.shape[0] != cx.shape[0]:
            if self.is_conv2d:
                cx = cx.repeat(x.shape[0] // cx.shape[0], 1, 1, 1)
            else:
                # print("x.shape[0] != cx.shape[0]", x.shape[0], cx.shape[0])
                cx = cx.repeat(x.shape[0] // cx.shape[0], 1, 1)

        try:
            cx = torch.cat([cx, self.down(x)], dim=1 if self.is_conv2d else 2)
            cx = self.mid(cx)
            cx = self.up(cx)
            return cx
        except RuntimeError:
            # high-res fix shape mismatch
            return 0


all_hack = {}


def clear_all_lllite():
    global all_hack
    for k, v in all_hack.items():
        k.forward = v
        k.lllite_list = []
    all_hack = {}
    return


class PlugableControlLLLite(torch.nn.Module):
    def __init__(self, state_dict):
        super().__init__()
        self.cache = {}

        module_weights = {}
        for key, value in state_dict.items():
            fragments = key.split(".")
            module_name = fragments[0]
            weight_name = ".".join(fragments[1:])

            if module_name not in module_weights:
                module_weights[module_name] = {}
            module_weights[module_name][weight_name] = value

        modules = {}
        for module_name, weights in module_weights.items():
            if "conditioning1.4.weight" in weights:
                depth = 3
            elif weights["conditioning1.2.weight"].shape[-1] == 4:
                depth = 2
            else:
                depth = 1

            module = LLLiteModule(
                name=module_name,
                is_conv2d=weights["down.0.weight"].ndim == 4,
                in_dim=weights["down.0.weight"].shape[1],
                depth=depth,
                cond_emb_dim=weights["conditioning1.0.weight"].shape[0] * 2,
                mlp_dim=weights["down.0.weight"].shape[0],
            )
            info = module.load_state_dict(weights)
            modules[module_name] = module
            setattr(self, module_name, module)
            if len(modules) == 1:
                module.is_first = True

        self.modules = modules
        return

    def reset(self):
        self.cache = {}
        return

    @torch.no_grad()
    def hook(self, model, cond, weight, start, end):
        global all_hack

        cond_image = cond * 2.0 - 1.0

        for module in self.modules.values():
            module.set_cond_image(cond_image)

        for k, v in self.modules.items():
            k = k.replace('middle_block', 'middle_blocks_0')
            match = re.match("lllite_unet_(.*)_blocks_(.*)_1_transformer_blocks_(.*)_(.*)_to_(.*)", k, re.M | re.I)
            assert match, 'Failed to load ControlLLLite!'
            root = match.group(1)
            block = match.group(2)
            block_number = match.group(3)
            attn_name = match.group(4)
            proj_name = match.group(5)
            if root == 'input':
                b = model.input_blocks[int(block)][1].transformer_blocks[int(block_number)]
            elif root == 'output':
                b = model.output_blocks[int(block)][1].transformer_blocks[int(block_number)]
            else:
                b = model.middle_block[1].transformer_blocks[int(block_number)]
            b = getattr(b, attn_name, None)
            assert b is not None, 'Failed to load ControlLLLite!'
            b = getattr(b, 'to_' + proj_name, None)
            assert b is not None, 'Failed to load ControlLLLite!'

            if not hasattr(b, 'lllite_list'):
                b.lllite_list = []

            if len(b.lllite_list) == 0:
                all_hack[b] = b.forward
                b.forward = self.get_hacked_forward(original_forward=b.forward, model=model, blk=b)

            b.lllite_list.append((weight, start, end, v))
        return

    def get_hacked_forward(self, original_forward, model, blk):
        @torch.no_grad()
        def forward(x, **kwargs):
            current_sampling_percent = getattr(model, 'current_sampling_percent', 0.5)
            hackers = blk.lllite_list

            hack = 0

            for weight, start, end, module in hackers:
                module.to(x.device)
                if current_sampling_percent < start or current_sampling_percent > end:
                    hack = hack + 0
                else:
                    hack = hack + module(x) * weight

            x = x + hack

            return original_forward(x, **kwargs)
        return forward
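A minimal sketch of how a LLLite checkpoint flows into this class — not part of the commit; the file name is made up:

import safetensors.torch

state_dict = safetensors.torch.load_file('controllllite_sdxl_canny.safetensors', device='cpu')
network = PlugableControlLLLite(state_dict)
# network.hook(unet, cond_image, weight=1.0, start=0.0, end=1.0) would then wrap
# each matched to_q/to_k/to_v forward so the LLLite residual is added to its input.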
@@ -0,0 +1,191 @@
import torch

from contextlib import contextmanager
from typing import Union, Tuple


_size_2_t = Union[int, Tuple[int, int]]


class LinearWithLoRA(torch.nn.Module):
    def __init__(
            self,
            in_features: int,
            out_features: int,
            bias: bool = True,
            device=None,
            dtype=None) -> None:
        super().__init__()
        self.weight_module = None
        self.up = None
        self.down = None
        self.bias = None
        self.in_features = in_features
        self.out_features = out_features
        self.device = device
        self.dtype = dtype
        self.weight = None

    def bind_lora(self, weight_module):
        self.weight_module = [weight_module]

    def unbind_lora(self):
        if self.up is not None and self.down is not None:  # SAI's model is weird and needs this
            self.weight_module = None

    def get_original_weight(self):
        if self.weight_module is None:
            return None
        return self.weight_module[0].weight

    def forward(self, x):
        if self.weight is not None:
            return torch.nn.functional.linear(x, self.weight.to(x),
                                              self.bias.to(x) if self.bias is not None else None)

        original_weight = self.get_original_weight()

        if original_weight is None:
            return None  # A1111 needs first_time_calculation

        if self.up is not None and self.down is not None:
            weight = original_weight.to(x) + torch.mm(self.up, self.down).to(x)
        else:
            weight = original_weight.to(x)

        return torch.nn.functional.linear(x, weight, self.bias.to(x) if self.bias is not None else None)


class Conv2dWithLoRA(torch.nn.Module):
    def __init__(
            self,
            in_channels: int,
            out_channels: int,
            kernel_size: _size_2_t,
            stride: _size_2_t = 1,
            padding: Union[str, _size_2_t] = 0,
            dilation: _size_2_t = 1,
            groups: int = 1,
            bias: bool = True,
            padding_mode: str = 'zeros',
            device=None,
            dtype=None
    ) -> None:
        super().__init__()
        self.stride = stride
        self.padding = padding
        self.dilation = dilation
        self.groups = groups
        self.weight_module = None
        self.bias = None
        self.up = None
        self.down = None
        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.padding_mode = padding_mode
        self.device = device
        self.dtype = dtype
        self.weight = None

    def bind_lora(self, weight_module):
        self.weight_module = [weight_module]

    def unbind_lora(self):
        if self.up is not None and self.down is not None:  # SAI's model is weird and needs this
            self.weight_module = None

    def get_original_weight(self):
        if self.weight_module is None:
            return None
        return self.weight_module[0].weight

    def forward(self, x):
        if self.weight is not None:
            return torch.nn.functional.conv2d(x, self.weight.to(x), self.bias.to(x) if self.bias is not None else None,
                                              self.stride, self.padding, self.dilation, self.groups)

        original_weight = self.get_original_weight()

        if original_weight is None:
            return None  # A1111 needs first_time_calculation

        if self.up is not None and self.down is not None:
            weight = original_weight.to(x) + torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1)).reshape(original_weight.shape).to(x)
        else:
            weight = original_weight.to(x)

        return torch.nn.functional.conv2d(x, weight, self.bias.to(x) if self.bias is not None else None,
                                          self.stride, self.padding, self.dilation, self.groups)


@contextmanager
def controlnet_lora_hijack():
    linear, conv2d = torch.nn.Linear, torch.nn.Conv2d
    torch.nn.Linear, torch.nn.Conv2d = LinearWithLoRA, Conv2dWithLoRA
    try:
        yield
    finally:
        torch.nn.Linear, torch.nn.Conv2d = linear, conv2d


def recursive_set(obj, key, value):
    if obj is None:
        return
    if '.' in key:
        k1, k2 = key.split('.', 1)
        recursive_set(getattr(obj, k1, None), k2, value)
    else:
        setattr(obj, key, value)


def force_load_state_dict(model, state_dict):
    for k in list(state_dict.keys()):
        recursive_set(model, k, torch.nn.Parameter(state_dict[k]))
        del state_dict[k]
    return


def recursive_bind_lora(obj, key, value):
    if obj is None:
        return
    if '.' in key:
        k1, k2 = key.split('.', 1)
        recursive_bind_lora(getattr(obj, k1, None), k2, value)
    else:
        target = getattr(obj, key, None)
        if target is not None and hasattr(target, 'bind_lora'):
            target.bind_lora(value)


def recursive_get(obj, key):
    if obj is None:
        return
    if '.' in key:
        k1, k2 = key.split('.', 1)
        return recursive_get(getattr(obj, k1, None), k2)
    else:
        return getattr(obj, key, None)


def bind_control_lora(base_model, control_lora_model):
    sd = base_model.state_dict()
    keys = list(sd.keys())
    keys = list(set([k.rsplit('.', 1)[0] for k in keys]))
    module_dict = {k: recursive_get(base_model, k) for k in keys}
    for k, v in module_dict.items():
        recursive_bind_lora(control_lora_model, k, v)


def torch_dfs(model: torch.nn.Module):
    result = [model]
    for child in model.children():
        result += torch_dfs(child)
    return result


def unbind_control_lora(control_lora_model):
    for m in torch_dfs(control_lora_model):
        if hasattr(m, 'unbind_lora'):
            m.unbind_lora()
    return
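A small self-check sketching what the hijack does — not part of the commit:

# Inside the context, torch.nn.Linear/Conv2d are the LoRA-aware stand-ins, so a
# ControlNet built here exposes bind_lora()/unbind_lora() on every layer.
with controlnet_lora_hijack():
    layer = torch.nn.Linear(4, 4)
assert isinstance(layer, LinearWithLoRA)
assert torch.nn.Linear is not LinearWithLoRA  # restored after the context exits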
@@ -0,0 +1,216 @@
import copy
import os
import torch
from pathlib import Path
from modules import devices

from scripts.adapter import PlugableAdapter, Adapter, StyleAdapter, Adapter_light
from scripts.controlnet_lllite import PlugableControlLLLite
from scripts.cldm import PlugableControlModel
from scripts.controlmodel_ipadapter import PlugableIPAdapter
from scripts.logging import logger
from scripts.controlnet_diffusers import convert_from_diffuser_state_dict
from scripts.controlnet_lora import controlnet_lora_hijack, force_load_state_dict


controlnet_default_config = {'adm_in_channels': None,
                             'in_channels': 4,
                             'model_channels': 320,
                             'num_res_blocks': 2,
                             'attention_resolutions': [1, 2, 4],
                             'transformer_depth': [1, 1, 1, 0],
                             'channel_mult': [1, 2, 4, 4],
                             'transformer_depth_middle': 1,
                             'use_linear_in_transformer': False,
                             'context_dim': 768,
                             "num_heads": 8,
                             "global_average_pooling": False}

controlnet_sdxl_config = {'num_classes': 'sequential',
                          'adm_in_channels': 2816,
                          'in_channels': 4,
                          'model_channels': 320,
                          'num_res_blocks': 2,
                          'attention_resolutions': [2, 4],
                          'transformer_depth': [0, 2, 10],
                          'channel_mult': [1, 2, 4],
                          'transformer_depth_middle': 10,
                          'use_linear_in_transformer': True,
                          'context_dim': 2048,
                          "num_head_channels": 64,
                          "global_average_pooling": False}

controlnet_sdxl_mid_config = {'num_classes': 'sequential',
                              'adm_in_channels': 2816,
                              'in_channels': 4,
                              'model_channels': 320,
                              'num_res_blocks': 2,
                              'attention_resolutions': [4],
                              'transformer_depth': [0, 0, 1],
                              'channel_mult': [1, 2, 4],
                              'transformer_depth_middle': 1,
                              'use_linear_in_transformer': True,
                              'context_dim': 2048,
                              "num_head_channels": 64,
                              "global_average_pooling": False}

controlnet_sdxl_small_config = {'num_classes': 'sequential',
                                'adm_in_channels': 2816,
                                'in_channels': 4,
                                'model_channels': 320,
                                'num_res_blocks': 2,
                                'attention_resolutions': [],
                                'transformer_depth': [0, 0, 0],
                                'channel_mult': [1, 2, 4],
                                'transformer_depth_middle': 0,
                                'use_linear_in_transformer': True,
                                "num_head_channels": 64,
                                'context_dim': 1,
                                "global_average_pooling": False}

t2i_adapter_config = {
    'channels': [320, 640, 1280, 1280],
    'nums_rb': 2,
    'ksize': 1,
    'sk': True,
    'cin': 192,
    'use_conv': False
}

t2i_adapter_light_config = {
    'channels': [320, 640, 1280, 1280],
    'nums_rb': 4,
    'cin': 192,
}

t2i_adapter_style_config = {
    'width': 1024,
    'context_dim': 768,
    'num_head': 8,
    'n_layes': 3,
    'num_token': 8,
}


def build_model_by_guess(state_dict, unet, model_path):
    if "lora_controlnet" in state_dict:
        del state_dict['lora_controlnet']
        config = copy.deepcopy(controlnet_sdxl_config)
        logger.info('controlnet_sdxl_config (using lora)')
        config['global_average_pooling'] = False
        config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1])
        config['use_fp16'] = devices.dtype_unet == torch.float16
        with controlnet_lora_hijack():
            network = PlugableControlModel(config, state_dict=None)
            force_load_state_dict(network.control_model, state_dict)
            network.is_control_lora = True
        network.to(devices.dtype_unet)
        return network

    if "controlnet_cond_embedding.conv_in.weight" in state_dict:
        state_dict = convert_from_diffuser_state_dict(state_dict)

    model_has_shuffle_in_filename = 'shuffle' in Path(os.path.abspath(model_path)).stem.lower()
    state_dict = {k.replace("control_model.", ""): v for k, v in state_dict.items()}
    state_dict = {k.replace("adapter.", ""): v for k, v in state_dict.items()}

    if 'input_hint_block.0.weight' in state_dict:
        if 'label_emb.0.0.bias' not in state_dict:
            config = copy.deepcopy(controlnet_default_config)
            logger.info('controlnet_default_config')
            config['global_average_pooling'] = model_has_shuffle_in_filename
            config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1])
            config['context_dim'] = int(state_dict['input_blocks.5.1.transformer_blocks.0.attn2.to_k.weight'].shape[1])
            for key in state_dict.keys():
                p = state_dict[key]
                if 'proj_in.weight' in key or 'proj_out.weight' in key:
                    if len(p.shape) == 2:
                        p = p[..., None, None]
                state_dict[key] = p
        else:
            has_full_layers = 'input_blocks.8.1.transformer_blocks.9.norm3.weight' in state_dict
            has_mid_layers = 'input_blocks.8.1.transformer_blocks.0.norm3.weight' in state_dict
            if has_full_layers:
                config = copy.deepcopy(controlnet_sdxl_config)
                logger.info('controlnet_sdxl_config')
            elif has_mid_layers:
                config = copy.deepcopy(controlnet_sdxl_mid_config)
                logger.info('controlnet_sdxl_mid_config')
            else:
                config = copy.deepcopy(controlnet_sdxl_small_config)
                logger.info('controlnet_sdxl_small_config')
            config['global_average_pooling'] = False
            config['hint_channels'] = int(state_dict['input_hint_block.0.weight'].shape[1])

        if 'difference' in state_dict and unet is not None:
            unet_state_dict = unet.state_dict()
            unet_state_dict_keys = unet_state_dict.keys()
            final_state_dict = {}
            for key in state_dict.keys():
                p = state_dict[key]
                if key in unet_state_dict_keys:
                    p_new = p + unet_state_dict[key].clone().cpu()
                else:
                    p_new = p
                final_state_dict[key] = p_new
            state_dict = final_state_dict

        config['use_fp16'] = devices.dtype_unet == torch.float16

        network = PlugableControlModel(config, state_dict)
        network.to(devices.dtype_unet)
        return network

    if 'conv_in.weight' in state_dict:
        logger.info('t2i_adapter_config')
        cin = int(state_dict['conv_in.weight'].shape[1])
        channel = int(state_dict['conv_in.weight'].shape[0])
        ksize = 1
        down_opts = tuple(filter(lambda item: item.endswith("down_opt.op.weight"), state_dict))
        use_conv = len(down_opts) > 0
        is_sdxl = (cin % 256) == 0
        adapter = Adapter(
            cin=cin,
            channels=[channel, channel*2, channel*4, channel*4],
            nums_rb=2,
            ksize=ksize,
            sk=True,
            use_conv=use_conv,
            is_sdxl=is_sdxl
        ).cpu()
        adapter.load_state_dict(state_dict, strict=False)
        network = PlugableAdapter(adapter)
        return network

    if 'style_embedding' in state_dict:
        config = copy.deepcopy(t2i_adapter_style_config)
        logger.info('t2i_adapter_style_config')
        adapter = StyleAdapter(**config).cpu()
        adapter.load_state_dict(state_dict, strict=False)
        network = PlugableAdapter(adapter)
        return network

    if 'body.0.in_conv.weight' in state_dict:
        config = copy.deepcopy(t2i_adapter_light_config)
        logger.info('t2i_adapter_light_config')
        config['cin'] = int(state_dict['body.0.in_conv.weight'].shape[1])
        adapter = Adapter_light(**config).cpu()
        adapter.load_state_dict(state_dict, strict=False)
        network = PlugableAdapter(adapter)
        return network

    if 'ip_adapter' in state_dict:
        plus = "latents" in state_dict["image_proj"]
        if plus:
            channel = int(state_dict['image_proj']['proj_in.weight'].shape[1])
        else:
            channel = int(state_dict['image_proj']['proj.weight'].shape[1])
        network = PlugableIPAdapter(state_dict, channel, plus)
        network.to('cpu')
        return network

    if any('lllite' in k for k in state_dict.keys()):
        network = PlugableControlLLLite(state_dict)
        network.to('cpu')
        return network

    raise ValueError('[ControlNet Error] Cannot recognize the ControlModel!')
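For reference, a usage sketch — not part of the commit; the path is an assumption, and this only runs inside a WebUI process where `devices` is initialized:

import safetensors.torch

path = 'models/ControlNet/control-lora-canny-rank256.safetensors'
state_dict = safetensors.torch.load_file(path, device='cpu')
# The SAI Control-LoRA branch is taken because the checkpoint carries the
# 'lora_controlnet' marker key; other checkpoints fall through the shape checks.
network = build_model_by_guess(state_dict, unet=None, model_path=path)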
@@ -14,6 +14,7 @@ from scripts import (
)
from scripts.processor import (
    preprocessor_sliders_config,
    no_control_mode_preprocessors,
    flag_preprocessor_resolution,
    model_free_preprocessors,
    preprocessor_filters,
@@ -566,6 +567,8 @@ class ControlNetUiGroup(object):
                0, self.prevent_next_n_slider_value_update - 1
            )

            grs += [gr.update(visible=module not in no_control_mode_preprocessors)]

            return grs

        inputs = [
@@ -579,6 +582,7 @@ class ControlNetUiGroup(object):
            self.advanced,
            self.model,
            self.refresh_models,
            self.control_mode
        ]
        self.module.change(build_sliders, inputs=inputs, outputs=outputs)
        self.pixel_perfect.change(build_sliders, inputs=inputs, outputs=outputs)
@@ -682,24 +686,14 @@ class ControlNetUiGroup(object):
                else None,
            )

            if "clip" in module:
                result = processor.clip_vision_visualization(result)
                if not is_image:
                    result = img
                    is_image = True

            if is_image:
                result = external_code.visualize_inpaint_mask(result)
            return (
                # Update to `generated_image`
                gr.update(value=result, visible=True, interactive=False),
                # preprocessor_preview
                gr.update(value=True),
                # openpose editor
                *self.openpose_editor.update(json_acceptor.value),
            )

            result = external_code.visualize_inpaint_mask(result)
            return (
                # Update to `generated_image`
                gr.update(value=None, visible=True),
                gr.update(value=result, visible=True, interactive=False),
                # preprocessor_preview
                gr.update(value=True),
                # openpose editor
@@ -1,4 +1,4 @@
version_flag = 'v1.1.313'
version_flag = 'v1.1.400'

from scripts.logging import logger
@@ -63,7 +63,11 @@ cn_preprocessor_modules = {
    "openpose_faceonly": functools.partial(g_openpose_model.run_model, include_body=False, include_hand=False, include_face=True),
    "openpose_full": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=True, include_face=True),
    "dw_openpose_full": functools.partial(g_openpose_model.run_model, include_body=True, include_hand=True, include_face=True, use_dw_pose=True),
    "clip_vision": clip,
    "clip_vision": functools.partial(clip, config='clip_vitl'),
    "revision_clipvision": functools.partial(clip, config='clip_g'),
    "revision_ignore_prompt": functools.partial(clip, config='clip_g'),
    "ip-adapter_clip_sd15": functools.partial(clip, config='clip_h'),
    "ip-adapter_clip_sdxl": functools.partial(clip, config='clip_g'),
    "color": color,
    "pidinet": pidinet,
    "pidinet_safe": pidinet_safe,
@@ -93,13 +97,19 @@ cn_preprocessor_modules = {
    "inpaint_only+lama": lama_inpaint,
    "tile_colorfix": identity,
    "tile_colorfix+sharp": identity,
    "recolor_luminance": recolor_luminance,
    "recolor_intensity": recolor_intensity,
}

cn_preprocessor_unloadable = {
    "hed": unload_hed,
    "fake_scribble": unload_hed,
    "mlsd": unload_mlsd,
    "clip": unload_clip,
    "clip_vision": functools.partial(unload_clip, config='clip_vitl'),
    "revision_clipvision": functools.partial(unload_clip, config='clip_g'),
    "revision_ignore_prompt": functools.partial(unload_clip, config='clip_g'),
    "ip-adapter_clip_sd15": functools.partial(unload_clip, config='clip_h'),
    "ip-adapter_clip_sdxl": functools.partial(unload_clip, config='clip_g'),
    "depth": unload_midas,
    "depth_leres": unload_leres,
    "normal_map": unload_midas,
@@ -148,18 +158,19 @@ ui_preprocessor_keys += sorted([preprocessor_aliases.get(k, k)

reverse_preprocessor_aliases = {preprocessor_aliases[k]: k for k in preprocessor_aliases.keys()}


def get_module_basename(module: Optional[str]) -> str:
    if module is None:
        module = 'none'
    return reverse_preprocessor_aliases.get(module, module)

default_conf = os.path.join("models", "cldm_v15.yaml")
default_conf_adapter = os.path.join("models", "t2iadapter_sketch_sd14v1.yaml")

default_detectedmap_dir = os.path.join("detected_maps")
script_dir = scripts.basedir()

os.makedirs(cn_models_dir, exist_ok=True)


def traverse_all_files(curr_path, model_list):
    f_list = [
        (os.path.join(curr_path, entry.name), entry.stat())
@@ -241,14 +252,14 @@ def select_control_type(control_type: str) -> Tuple[List[str], List[str], str, s
    filtered_preprocessor_list = [
        x
        for x in preprocessor_list
        if pattern in x.lower() or x.lower() == "none"
        if pattern in x.lower() or any(a in x.lower() for a in preprocessor_filters_aliases.get(pattern, [])) or x.lower() == "none"
    ]
    if pattern in ["canny", "lineart", "scribble", "mlsd"]:
        filtered_preprocessor_list += [
            x for x in preprocessor_list if "invert" in x.lower()
        ]
    filtered_model_list = [
        x for x in model_list if pattern in x.lower() or x.lower() == "none"
        x for x in model_list if pattern in x.lower() or any(a in x.lower() for a in preprocessor_filters_aliases.get(pattern, [])) or x.lower() == "none"
    ]
    if default_option not in filtered_preprocessor_list:
        default_option = filtered_preprocessor_list[0]
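A sketch of what the alias lookup buys — not part of the commit; the example names are made up:

pattern = 't2i-adapter'
model_list = ['t2iadapter_canny_sd14v1 [abc123]', 'control_v11p_sd15_canny [def456]', 'None']
filtered = [x for x in model_list
            if pattern in x.lower()
            or any(a in x.lower() for a in preprocessor_filters_aliases.get(pattern, []))
            or x.lower() == 'none']
assert filtered == ['t2iadapter_canny_sd14v1 [abc123]', 'None']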
scripts/hook.py
@@ -3,9 +3,10 @@ import einops
import hashlib
import numpy as np
import torch.nn as nn

from functools import partial
import modules.processing


from enum import Enum
from scripts.logging import logger
from modules import devices, lowvram, shared, scripts
@@ -21,6 +22,13 @@ from modules.prompt_parser import MulticondLearnedConditioning, ComposableSchedu
from modules.processing import StableDiffusionProcessing


try:
    from sgm.modules.attention import BasicTransformerBlock as BasicTransformerBlockSGM
except ImportError:
    print('Webui version too old!')
    BasicTransformerBlockSGM = BasicTransformerBlock


POSITIVE_MARK_TOKEN = 1024
NEGATIVE_MARK_TOKEN = - POSITIVE_MARK_TOKEN
MARK_EPS = 1e-3
@@ -45,12 +53,20 @@ def mark_prompt_context(x, positive):
        x.schedules = mark_prompt_context(x.schedules, positive)
        return x
    if isinstance(x, ScheduledPromptConditioning):
        cond = x.cond
        if prompt_context_is_marked(cond):
            return x
        mark = POSITIVE_MARK_TOKEN if positive else NEGATIVE_MARK_TOKEN
        cond = torch.cat([torch.zeros_like(cond)[:1] + mark, cond], dim=0)
        return ScheduledPromptConditioning(end_at_step=x.end_at_step, cond=cond)
        if isinstance(x.cond, dict):
            cond = x.cond['crossattn']
            if prompt_context_is_marked(cond):
                return x
            mark = POSITIVE_MARK_TOKEN if positive else NEGATIVE_MARK_TOKEN
            cond = torch.cat([torch.zeros_like(cond)[:1] + mark, cond], dim=0)
            return ScheduledPromptConditioning(end_at_step=x.end_at_step, cond=dict(crossattn=cond, vector=x.cond['vector']))
        else:
            cond = x.cond
            if prompt_context_is_marked(cond):
                return x
            mark = POSITIVE_MARK_TOKEN if positive else NEGATIVE_MARK_TOKEN
            cond = torch.cat([torch.zeros_like(cond)[:1] + mark, cond], dim=0)
            return ScheduledPromptConditioning(end_at_step=x.end_at_step, cond=cond)
    return x
@@ -91,27 +107,22 @@ def unmark_prompt_context(x):
    return mark_batch, uc_indices, context


def create_random_tensors_hacked(*args, **kwargs):
    result = modules.processing.create_random_tensors_original(*args, **kwargs)
    p = kwargs.get('p', None)
    if p is None:
        return result
    controlnet_initial_noise_modifier = getattr(p, 'controlnet_initial_noise_modifier', None)
    if controlnet_initial_noise_modifier is not None:
        x0 = controlnet_initial_noise_modifier
class HackedImageRNG:
    def __init__(self, rng, noise_modifier, sd_model):
        self.rng = rng
        self.noise_modifier = noise_modifier
        self.sd_model = sd_model

    def next(self):
        result = self.rng.next()
        x0 = self.noise_modifier
        if result.shape[2] != x0.shape[2] or result.shape[3] != x0.shape[3]:
            return result
        x0 = x0.to(result.dtype).to(result.device)
        ts = torch.tensor([p.sd_model.num_timesteps - 1] * result.shape[0]).long().to(result.device)
        result = p.sd_model.q_sample(x0, ts, result)
        ts = torch.tensor([999] * result.shape[0]).long().to(result.device)
        result = predict_q_sample(self.sd_model, x0, ts, result)
        logger.info(f'[ControlNet] Initial noise hack applied to {result.shape}.')
        return result


if getattr(modules.processing, 'create_random_tensors_original', None) is None:
    modules.processing.create_random_tensors_original = modules.processing.create_random_tensors

modules.processing.create_random_tensors = create_random_tensors_hacked
    return result

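The re-noising step is ordinary DDPM forward diffusion at the final timestep; a standalone sketch with made-up shapes — not part of the commit:

import torch

x0 = torch.zeros(1, 4, 64, 64)        # latent supplied by the noise modifier
noise = torch.randn_like(x0)          # what rng.next() would have returned
sqrt_ac, sqrt_1m_ac = 0.0684, 0.9977  # approx. sqrt(alphas_cumprod[999]) and sqrt(1 - alphas_cumprod[999])
result = sqrt_ac * x0 + sqrt_1m_ac * noise  # what predict_q_sample computes at t = 999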
class ControlModelType(Enum):
@@ -129,6 +140,9 @@ class ControlModelType(Enum):
    StableSR = "StableSR, Jianyi Wang"
    PromptDiffusion = "PromptDiffusion, Zhendong Wang"
    ControlLoRA = "ControlLoRA, Wu Hecong"
    ReVision = "ReVision, Stability"
    IPAdapter = "IPAdapter, Hu Ye"
    Controlllite = "Controlllite, Kohya"


# Written by Lvmin
@@ -252,12 +266,41 @@ def torch_dfs(model: torch.nn.Module):
    return result


def register_schedule(self):
    linear_start = 0.00085
    linear_end = 0.0120
    num_timesteps = 1000

    betas = (torch.linspace(linear_start ** 0.5, linear_end ** 0.5, num_timesteps, dtype=torch.float64) ** 2.0).numpy()

    alphas = 1. - betas
    alphas_cumprod = np.cumprod(alphas, axis=0)
    alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

    to_torch = partial(torch.tensor, dtype=torch.float32)

    setattr(self, 'betas', to_torch(betas))
    # setattr(self, 'alphas_cumprod', to_torch(alphas_cumprod))  # a1111 already has this
    setattr(self, 'alphas_cumprod_prev', to_torch(alphas_cumprod_prev))
    setattr(self, 'sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod)))
    setattr(self, 'sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod)))
    setattr(self, 'log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod)))
    setattr(self, 'sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod)))
    setattr(self, 'sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1)))


def predict_q_sample(ldm, x_start, t, noise=None):
    if noise is None:
        noise = torch.randn_like(x_start)
    return extract_into_tensor(ldm.sqrt_alphas_cumprod.to(x_start), t, x_start.shape) * x_start + extract_into_tensor(ldm.sqrt_one_minus_alphas_cumprod.to(x_start), t, x_start.shape) * noise


def predict_start_from_noise(ldm, x_t, t, noise):
    return extract_into_tensor(ldm.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * noise
    return extract_into_tensor(ldm.sqrt_recip_alphas_cumprod.to(x_t), t, x_t.shape) * x_t - extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod.to(x_t), t, x_t.shape) * noise


def predict_noise_from_start(ldm, x_t, t, x0):
    return (extract_into_tensor(ldm.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t - x0) / extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod, t, x_t.shape)
    return (extract_into_tensor(ldm.sqrt_recip_alphas_cumprod.to(x_t), t, x_t.shape) * x_t - x0) / extract_into_tensor(ldm.sqrt_recipm1_alphas_cumprod.to(x_t), t, x_t.shape)

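predict_start_from_noise() is the algebraic inverse of predict_q_sample(); a quick self-check with a stand-in schedule — not part of the commit:

import types

ac = torch.linspace(0.9999, 0.005, 1000)  # stand-in alphas_cumprod
ldm_stub = types.SimpleNamespace(
    sqrt_alphas_cumprod=ac.sqrt(),
    sqrt_one_minus_alphas_cumprod=(1 - ac).sqrt(),
    sqrt_recip_alphas_cumprod=(1 / ac).sqrt(),
    sqrt_recipm1_alphas_cumprod=(1 / ac - 1).sqrt(),
)
x0 = torch.randn(2, 4, 8, 8)
eps = torch.randn_like(x0)
t = torch.tensor([500, 999])
x_t = predict_q_sample(ldm_stub, x0, t, eps)
assert torch.allclose(predict_start_from_noise(ldm_stub, x_t, t, eps), x0, atol=1e-3)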
def blur(x, k):
@@ -298,6 +341,7 @@ class UnetHook(nn.Module):
        self.gn_auto_machine_weight = 1.0
        self.current_style_fidelity = 0.0
        self.current_uc_indices = None
        self.global_revision = None

    @staticmethod
    def call_vae_using_process(p, x, batch_size=None, mask=None):
@@ -317,6 +361,15 @@ class UnetHook(nn.Module):
                with devices.autocast():
                    vae_output = p.sd_model.encode_first_stage(x)
                    vae_output = p.sd_model.get_first_stage_encoding(vae_output)
                if torch.all(torch.isnan(vae_output)).item():
                    logger.info('ControlNet found NaNs in the VAE encoding.\n'
                                'ControlNet will now automatically retry.\n'
                                'To always start with a 32-bit VAE, use the --no-half-vae command line flag.')
                    devices.dtype_vae = torch.float32
                    x = x.to(devices.dtype_vae)
                    p.sd_model.first_stage_model.to(devices.dtype_vae)
                    vae_output = p.sd_model.encode_first_stage(x)
                    vae_output = p.sd_model.get_first_stage_encoding(vae_output)
                vae_cache.set(x, vae_output)
                logger.info(f'ControlNet used {str(devices.dtype_vae)} VAE to encode {vae_output.shape}.')
            latent = vae_output
@@ -332,12 +385,16 @@ class UnetHook(nn.Module):
        for param in self.control_params:
            current_sampling_percent = (x.sampling_step / x.total_sampling_steps)
            param.guidance_stopped = current_sampling_percent < param.start_guidance_percent or current_sampling_percent > param.stop_guidance_percent
        if self.model is not None:
            self.model.current_sampling_percent = current_sampling_percent

    def hook(self, model, sd_ldm, control_params, process):
        self.model = model
        self.sd_ldm = sd_ldm
        self.control_params = control_params

        model_is_sdxl = getattr(self.sd_ldm, 'is_sdxl', False)

        outer = self

        def process_sample(*args, **kwargs):
@@ -354,17 +411,28 @@ class UnetHook(nn.Module):
            mark_prompt_context(getattr(process, 'hr_uc', []), positive=False)
            return process.sample_before_CN_hack(*args, **kwargs)

        def forward(self, x, timesteps=None, context=None, **kwargs):
            total_controlnet_embedding = [0.0] * 13
        def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
            is_sdxl = y is not None and model_is_sdxl
            total_t2i_adapter_embedding = [0.0] * 4
            if is_sdxl:
                total_controlnet_embedding = [0.0] * 10
            else:
                total_controlnet_embedding = [0.0] * 13
            require_inpaint_hijack = False
            is_in_high_res_fix = False
            batch_size = int(x.shape[0])

            # Handle cond-uncond marker
            cond_mark, outer.current_uc_indices, context = unmark_prompt_context(context)
            outer.model.cond_mark = cond_mark
            # logger.info(str(cond_mark[:, 0, 0, 0].detach().cpu().numpy().tolist()) + ' - ' + str(outer.current_uc_indices))

            # Revision
            if is_sdxl and isinstance(outer.global_revision, torch.Tensor):
                y[:, :1280] = outer.global_revision * cond_mark[:, :, 0, 0]
                if any('ignore_prompt' in param.preprocessor['name'] for param in outer.control_params):
                    context = torch.zeros_like(context)

            # High-res fix
            for param in outer.control_params:
                # select which hint_cond to use
@@ -404,6 +472,19 @@ class UnetHook(nn.Module):
                    continue
                param.used_hint_cond_latent = outer.call_vae_using_process(process, param.used_hint_cond, batch_size=batch_size)

            # vram
            for param in outer.control_params:
                if getattr(param.control_model, 'disable_memory_management', False):
                    continue

                if param.control_model is not None:
                    if outer.lowvram and is_sdxl and hasattr(param.control_model, 'aggressive_lowvram'):
                        param.control_model.aggressive_lowvram()
                    elif hasattr(param.control_model, 'fullvram'):
                        param.control_model.fullvram()
                    elif hasattr(param.control_model, 'to'):
                        param.control_model.to(devices.get_device_for("controlnet"))

            # handle prompt token control
            for param in outer.control_params:
                if no_high_res_control:
@@ -415,7 +496,6 @@ class UnetHook(nn.Module):
                if param.control_model_type not in [ControlModelType.T2I_StyleAdapter]:
                    continue

                param.control_model.to(devices.get_device_for("controlnet"))
                control = param.control_model(x=x, hint=param.used_hint_cond, timesteps=timesteps, context=context)
                control = torch.cat([control.clone() for _ in range(batch_size)], dim=0)
                control *= param.weight
@@ -433,7 +513,6 @@ class UnetHook(nn.Module):
                if param.control_model_type not in [ControlModelType.ControlNet, ControlModelType.T2I_Adapter]:
                    continue

                param.control_model.to(devices.get_device_for("controlnet"))
                # inpaint model workaround
                x_in = x
                control_model = param.control_model.control_model
@@ -455,11 +534,12 @@ class UnetHook(nn.Module):
                        m = (m > 0.5).float()
                        hint = c * (1 - m) - m

                control = param.control_model(x=x_in, hint=hint, timesteps=timesteps, context=context)
                control_scales = ([param.weight] * 13)
                control = param.control_model(x=x_in, hint=hint, timesteps=timesteps, context=context, y=y)

                if outer.lowvram:
                    param.control_model.to("cpu")
                if is_sdxl:
                    control_scales = [param.weight] * 10
                else:
                    control_scales = [param.weight] * 13

                if param.cfg_injection or param.global_average_pooling:
                    if param.control_model_type == ControlModelType.T2I_Adapter:
@@ -484,6 +564,9 @@ class UnetHook(nn.Module):
                    elif param.control_model_type == ControlModelType.ControlNet:
                        control_scales = [param.weight * (0.825 ** float(12 - i)) for i in range(13)]

                if is_sdxl and param.control_model_type == ControlModelType.ControlNet:
                    control_scales = control_scales[:10]

                if param.advanced_weighting is not None:
                    control_scales = param.advanced_weighting
@@ -502,6 +585,8 @@ class UnetHook(nn.Module):

            # Replace x_t to support inpaint models
            for param in outer.control_params:
                if not isinstance(param.used_hint_cond, torch.Tensor):
                    continue
                if param.used_hint_cond.shape[1] != 4:
                    continue
                if x.shape[1] != 9:
@@ -518,8 +603,14 @@ class UnetHook(nn.Module):
                        param.used_hint_inpaint_hijack.to(x.dtype).to(x.device)
                    x = torch.cat([x[:, :4, :, :], param.used_hint_inpaint_hijack], dim=1)

            # vram
            for param in outer.control_params:
                if param.control_model is not None:
                    if outer.lowvram:
                        param.control_model.to('cpu')

            # A1111 fix for medvram.
            if shared.cmd_opts.medvram:
            if shared.cmd_opts.medvram or (getattr(shared.cmd_opts, 'medvram_sdxl', False) and is_sdxl):
                try:
                    # Trigger the register_forward_pre_hook
                    outer.sd_ldm.model()
@@ -549,7 +640,7 @@ class UnetHook(nn.Module):
                if param.control_model_type not in [ControlModelType.AttentionInjection]:
                    continue

                ref_xt = outer.sd_ldm.q_sample(param.used_hint_cond_latent, torch.round(timesteps.float()).long())
                ref_xt = predict_q_sample(outer.sd_ldm, param.used_hint_cond_latent, torch.round(timesteps.float()).long())

                # Inpaint Hijack
                if x.shape[1] == 9:
@@ -562,6 +653,12 @@ class UnetHook(nn.Module):
                    outer.current_style_fidelity = float(param.preprocessor['threshold_a'])
                    outer.current_style_fidelity = max(0.0, min(1.0, outer.current_style_fidelity))

                    if is_sdxl:
                        # sdxl's attention hacking is highly unstable.
                        # We have no other methods but to reduce the style_fidelity a bit.
                        # By default, 0.5 ** 3.0 = 0.125
                        outer.current_style_fidelity = outer.current_style_fidelity ** 3.0

                    if param.cfg_injection:
                        outer.current_style_fidelity = 1.0
                    elif param.soft_injection or is_in_high_res_fix:
@@ -577,11 +674,19 @@ class UnetHook(nn.Module):
                    outer.gn_auto_machine = AutoMachine.Write
                    outer.gn_auto_machine_weight = param.weight

                    outer.original_forward(
                        x=ref_xt.to(devices.dtype_unet),
                        timesteps=timesteps.to(devices.dtype_unet),
                        context=context.to(devices.dtype_unet)
                    )
                    if is_sdxl:
                        outer.original_forward(
                            x=ref_xt.to(devices.dtype_unet),
                            timesteps=timesteps.to(devices.dtype_unet),
                            context=context.to(devices.dtype_unet),
                            y=y
                        )
                    else:
                        outer.original_forward(
                            x=ref_xt.to(devices.dtype_unet),
                            timesteps=timesteps.to(devices.dtype_unet),
                            context=context.to(devices.dtype_unet)
                        )

                    outer.attention_auto_machine = AutoMachine.Read
                    outer.gn_auto_machine = AutoMachine.Read
@@ -591,11 +696,18 @@ class UnetHook(nn.Module):
            with th.no_grad():
                t_emb = cond_cast_unet(timestep_embedding(timesteps, self.model_channels, repeat_only=False))
                emb = self.time_embed(t_emb)
                h = x.type(self.dtype)

                if is_sdxl:
                    assert y.shape[0] == x.shape[0]
                    emb = emb + self.label_emb(y)

                h = x
                for i, module in enumerate(self.input_blocks):
                    h = module(h, emb, context)

                    if (i + 1) % 3 == 0:
                    t2i_injection = [3, 5, 8] if is_sdxl else [2, 5, 8, 11]

                    if i in t2i_injection:
                        h = aligned_adding(h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack)

                    hs.append(h)
@@ -604,6 +716,9 @@ class UnetHook(nn.Module):
                # U-Net Middle Block
                h = aligned_adding(h, total_controlnet_embedding.pop(), require_inpaint_hijack)

                if len(total_t2i_adapter_embedding) > 0 and is_sdxl:
                    h = aligned_adding(h, total_t2i_adapter_embedding.pop(0), require_inpaint_hijack)

                # U-Net Decoder
                for i, module in enumerate(self.output_blocks):
                    h = th.cat([h, aligned_adding(hs.pop(), total_controlnet_embedding.pop(), require_inpaint_hijack)], dim=1)
@@ -668,18 +783,23 @@ class UnetHook(nn.Module):

            return h

        def move_all_control_model_to_cpu():
            for param in getattr(outer, 'control_params', []):
                if isinstance(param.control_model, torch.nn.Module):
                    param.control_model.to("cpu")

        def forward_webui(*args, **kwargs):
            # webui will handle other components
            try:
                if shared.cmd_opts.lowvram:
                    lowvram.send_everything_to_cpu()
                return forward(*args, **kwargs)
            except Exception as e:
                move_all_control_model_to_cpu()
                raise e
            finally:
                if self.lowvram:
                    for param in self.control_params:
                        if isinstance(param.control_model, torch.nn.Module):
                            param.control_model.to("cpu")
                if outer.lowvram:
                    move_all_control_model_to_cpu()

        def hacked_basic_transformer_inner_forward(self, x, context=None):
            x_norm1 = self.norm1(x)
@@ -716,7 +836,7 @@ class UnetHook(nn.Module):

        def hacked_group_norm_forward(self, *args, **kwargs):
            eps = 1e-6
            x = self.original_forward(*args, **kwargs)
            x = self.original_forward_cn_hijack(*args, **kwargs)
            y = None
            if outer.gn_auto_machine == AutoMachine.Write:
                if outer.gn_auto_machine_weight > self.gn_weight:
@@ -752,56 +872,78 @@ class UnetHook(nn.Module):
        outer.original_forward = model.forward
        model.forward = forward_webui.__get__(model, UNetModel)

        if model_is_sdxl:
            register_schedule(sd_ldm)

        need_attention_hijack = False

        for param in outer.control_params:
            if param.control_model_type in [ControlModelType.AttentionInjection]:
                need_attention_hijack = True

        all_modules = torch_dfs(model)

        attn_modules = [module for module in all_modules if isinstance(module, BasicTransformerBlock)]
        attn_modules = sorted(attn_modules, key=lambda x: - x.norm1.normalized_shape[0])
        if need_attention_hijack:
            attn_modules = [module for module in all_modules if isinstance(module, BasicTransformerBlock) or isinstance(module, BasicTransformerBlockSGM)]
            attn_modules = sorted(attn_modules, key=lambda x: - x.norm1.normalized_shape[0])

        for i, module in enumerate(attn_modules):
            if getattr(module, '_original_inner_forward', None) is None:
                module._original_inner_forward = module._forward
            module._forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
            module.bank = []
            module.style_cfgs = []
            module.attn_weight = float(i) / float(len(attn_modules))
            for i, module in enumerate(attn_modules):
                if getattr(module, '_original_inner_forward_cn_hijack', None) is None:
                    module._original_inner_forward_cn_hijack = module._forward
                module._forward = hacked_basic_transformer_inner_forward.__get__(module, BasicTransformerBlock)
                module.bank = []
                module.style_cfgs = []
                module.attn_weight = float(i) / float(len(attn_modules))

        gn_modules = [model.middle_block]
        model.middle_block.gn_weight = 0
            gn_modules = [model.middle_block]
            model.middle_block.gn_weight = 0

        input_block_indices = [4, 5, 7, 8, 10, 11]
        for w, i in enumerate(input_block_indices):
            module = model.input_blocks[i]
            module.gn_weight = 1.0 - float(w) / float(len(input_block_indices))
            gn_modules.append(module)
            if model_is_sdxl:
                input_block_indices = [4, 5, 7, 8]
                output_block_indices = [0, 1, 2, 3, 4, 5]
            else:
                input_block_indices = [4, 5, 7, 8, 10, 11]
                output_block_indices = [0, 1, 2, 3, 4, 5, 6, 7]

        output_block_indices = [0, 1, 2, 3, 4, 5, 6, 7]
        for w, i in enumerate(output_block_indices):
            module = model.output_blocks[i]
            module.gn_weight = float(w) / float(len(output_block_indices))
            gn_modules.append(module)
            for w, i in enumerate(input_block_indices):
                module = model.input_blocks[i]
                module.gn_weight = 1.0 - float(w) / float(len(input_block_indices))
                gn_modules.append(module)

        for i, module in enumerate(gn_modules):
            if getattr(module, 'original_forward', None) is None:
                module.original_forward = module.forward
            module.forward = hacked_group_norm_forward.__get__(module, torch.nn.Module)
            module.mean_bank = []
            module.var_bank = []
            module.style_cfgs = []
            module.gn_weight *= 2
            for w, i in enumerate(output_block_indices):
                module = model.output_blocks[i]
                module.gn_weight = float(w) / float(len(output_block_indices))
                gn_modules.append(module)

        outer.attn_module_list = attn_modules
        outer.gn_module_list = gn_modules
            for i, module in enumerate(gn_modules):
                if getattr(module, 'original_forward_cn_hijack', None) is None:
                    module.original_forward_cn_hijack = module.forward
                module.forward = hacked_group_norm_forward.__get__(module, torch.nn.Module)
                module.mean_bank = []
                module.var_bank = []
                module.style_cfgs = []
                module.gn_weight *= 2

            outer.attn_module_list = attn_modules
            outer.gn_module_list = gn_modules
        else:
            for module in all_modules:
                _original_inner_forward_cn_hijack = getattr(module, '_original_inner_forward_cn_hijack', None)
                original_forward_cn_hijack = getattr(module, 'original_forward_cn_hijack', None)
                if _original_inner_forward_cn_hijack is not None:
                    module._forward = _original_inner_forward_cn_hijack
                if original_forward_cn_hijack is not None:
                    module.forward = original_forward_cn_hijack
            outer.attn_module_list = []
            outer.gn_module_list = []

        scripts.script_callbacks.on_cfg_denoiser(self.guidance_schedule_handler)

    def restore(self, model):
    def restore(self):
        scripts.script_callbacks.remove_callbacks_for_function(self.guidance_schedule_handler)
        if hasattr(self, "control_params"):
            del self.control_params
        self.control_params = None

        if not hasattr(model, "_original_forward"):
            # no such handle, ignore
            return

        model.forward = model._original_forward
        del model._original_forward
        if self.model is not None:
            if hasattr(self.model, "_original_forward"):
                self.model.forward = self.model._original_forward
                del self.model._original_forward
@@ -83,7 +83,7 @@ class Script(scripts.Script):
        # The returned values are passed to the run method as parameters.

        ctrls_group = ()
        max_models = opts.data.get("control_net_max_models_num", 1)
        max_models = opts.data.get("control_net_unit_count", 3)

        with gr.Group():
            with gr.Accordion("ControlNet-M2M", open = False):
@@ -109,7 +109,7 @@ class Script(scripts.Script):
        # to be used in processing. The return value should be a Processed object, which is
        # what is returned by the process_images method.

        contents_num = opts.data.get("control_net_max_models_num", 1)
        contents_num = opts.data.get("control_net_unit_count", 3)
        arg_num = 3
        item_list = []
        video_list = []
@@ -336,30 +336,28 @@ def unload_pidinet():
    unload_pid_model()


clip_encoder = None
clip_encoder = {
    'clip_g': None,
    'clip_h': None,
    'clip_vitl': None,
}


def clip(img, res=512, **kwargs):
def clip(img, res=512, config='clip_vitl', **kwargs):
    img = HWC3(img)
    global clip_encoder
    if clip_encoder is None:
        from annotator.clip import apply_clip
        clip_encoder = apply_clip
    result = clip_encoder(img)
    if clip_encoder[config] is None:
        from annotator.clipvision import ClipVisionDetector
        clip_encoder[config] = ClipVisionDetector(config)
    result = clip_encoder[config](img)
    return result, False


def clip_vision_visualization(x):
    x = x.detach().cpu().numpy()[0]
    x = np.ascontiguousarray(x).copy()
    return np.ndarray((x.shape[0] * 4, x.shape[1]), dtype="uint8", buffer=x.tobytes())


def unload_clip():
def unload_clip(config='clip_vitl'):
    global clip_encoder
    if clip_encoder is not None:
        from annotator.clip import unload_clip_model
        unload_clip_model()
    if clip_encoder[config] is not None:
        clip_encoder[config].unload_model()
        clip_encoder[config] = None


model_color = None
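The encoder cache is now keyed by CLIP config, so the SD15 IP-Adapter encoder (ViT-H), the SDXL IP-Adapter/Revision encoder (ViT-bigG) and the legacy ViT-L encoder can coexist; a call sketch with a made-up image — not part of the commit:

import numpy as np

dummy = np.zeros((512, 512, 3), dtype=np.uint8)
embedding, _ = clip(dummy, config='clip_g')  # lazily loads only the ViT-bigG encoder
unload_clip(config='clip_g')                 # frees only that encoder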
@@ -597,10 +595,39 @@ def shuffle(img, res=512, **kwargs):
     return result, True
 
 
+def recolor_luminance(img, res=512, thr_a=1.0, **kwargs):
+    result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2LAB)
+    result = result[:, :, 0].astype(np.float32) / 255.0
+    result = result ** thr_a
+    result = (result * 255.0).clip(0, 255).astype(np.uint8)
+    result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
+    return result, True
+
+
+def recolor_intensity(img, res=512, thr_a=1.0, **kwargs):
+    result = cv2.cvtColor(HWC3(img), cv2.COLOR_BGR2HSV)
+    result = result[:, :, 2].astype(np.float32) / 255.0
+    result = result ** thr_a
+    result = (result * 255.0).clip(0, 255).astype(np.uint8)
+    result = cv2.cvtColor(result, cv2.COLOR_GRAY2RGB)
+    return result, True
+
+
 model_free_preprocessors = [
     "reference_only",
     "reference_adain",
-    "reference_adain+attn"
+    "reference_adain+attn",
+    "revision_clipvision",
+    "revision_ignore_prompt"
 ]
 
+no_control_mode_preprocessors = [
+    "revision_clipvision",
+    "revision_ignore_prompt",
+    "clip_vision",
+    "ip-adapter_clip_sd15",
+    "ip-adapter_clip_sdxl",
+    "t2ia_style_clipvision"
+]
+
 flag_preprocessor_resolution = "Preprocessor Resolution"
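
The two new recolor preprocessors are one operation applied to different color spaces: isolate a single channel (LAB luminance or HSV value), run it through a gamma curve x ** thr_a, and expand the result back to a 3-channel grayscale image. A self-contained check of the gamma step (sample values chosen arbitrarily):

import numpy as np

# Normalize a channel to [0, 1], apply gamma, rescale to uint8 -- exactly the
# middle three lines of recolor_luminance / recolor_intensity.
channel = np.array([0, 64, 128, 192, 255], dtype=np.float32) / 255.0
for thr_a in (0.5, 1.0, 2.0):
    out = (channel ** thr_a * 255.0).clip(0, 255).astype(np.uint8)
    print(thr_a, out)
# thr_a < 1 lifts midtones, thr_a == 1 is the identity, thr_a > 1 darkens them,
# which is why the slider added later in this diff is labeled "Gamma Correction".
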
@@ -608,6 +635,8 @@ preprocessor_sliders_config = {
     "none": [],
     "inpaint": [],
     "inpaint_only": [],
+    "revision_clipvision": [],
+    "revision_ignore_prompt": [],
     "canny": [
         {
             "name": flag_preprocessor_resolution,
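
A plausible reading of preprocessor_sliders_config, consistent with the entries in this diff: each preprocessor maps to a list of slider descriptors, an empty list (as for the two revision modes above) means no tunable sliders, and a None entry leaves one slider slot unused. A sketch of how one descriptor could become a control (the extension's actual UI wiring differs):

import gradio as gr

def build_slider(desc):
    if desc is None:
        return gr.Slider(visible=False)  # None = leave this slider slot hidden
    return gr.Slider(
        label=desc["name"],
        value=desc.get("value", 1.0),
        minimum=desc["min"],
        maximum=desc["max"],
        step=desc.get("step", 0.01),
    )
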
@@ -900,23 +929,55 @@ preprocessor_sliders_config = {
             "step": 0.01
         }
     ],
+    "recolor_luminance": [
+        None,
+        {
+            "name": "Gamma Correction",
+            "value": 1.0,
+            "min": 0.1,
+            "max": 2.0,
+            "step": 0.001
+        }
+    ],
+    "recolor_intensity": [
+        None,
+        {
+            "name": "Gamma Correction",
+            "value": 1.0,
+            "min": 0.1,
+            "max": 2.0,
+            "step": 0.001
+        }
+    ],
 }
 
 preprocessor_filters = {
     "All": "none",
     "Canny": "canny",
     "Depth": "depth_midas",
-    "Normal": "normal_bae",
+    "NormalMap": "normal_bae",
     "OpenPose": "openpose_full",
     "MLSD": "mlsd",
     "Lineart": "lineart_standard (from white bg & black line)",
     "SoftEdge": "softedge_pidinet",
-    "Scribble": "scribble_pidinet",
-    "Seg": "seg_ofade20k",
+    "Scribble/Sketch": "scribble_pidinet",
+    "Segmentation": "seg_ofade20k",
     "Shuffle": "shuffle",
     "Tile": "tile_resample",
     "Inpaint": "inpaint_only",
-    "IP2P": "none",
+    "InstructP2P": "none",
     "Reference": "reference_only",
-    "T2IA": "none",
+    "Recolor": "recolor_luminance",
+    "Revision": "revision_clipvision",
+    "T2I-Adapter": "none",
+    "IP-Adapter": "ip-adapter_clip_sd15",
 }
 
+preprocessor_filters_aliases = {
+    'instructp2p': ['ip2p'],
+    'segmentation': ['seg'],
+    'normalmap': ['normal'],
+    't2i-adapter': ['t2i_adapter', 't2iadapter', 't2ia'],
+    'ip-adapter': ['ip_adapter', 'ipadapter'],
+    'scribble/sketch': ['scribble', 'sketch']
+} # must use all lower texts
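
The new alias table exists because this commit renames several filter categories (Seg to Segmentation, IP2P to InstructP2P, Normal to NormalMap, Scribble to Scribble/Sketch), so old spellings can still resolve to the new canonical keys. A sketch of such a lookup (function name hypothetical):

def resolve_filter(name, filters, aliases):
    needle = name.strip().lower()
    # Direct hit on a canonical key, compared case-insensitively.
    for canonical in filters:
        if canonical.lower() == needle:
            return canonical
    # Otherwise search the alias lists, which map lower-cased canonical
    # names to their legacy spellings.
    for canonical_lower, old_names in aliases.items():
        if needle in old_names:
            for canonical in filters:
                if canonical.lower() == canonical_lower:
                    return canonical
    raise KeyError(f"unknown preprocessor filter: {name!r}")

# resolve_filter("seg", preprocessor_filters, preprocessor_filters_aliases)
# -> "Segmentation"
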
@@ -4,24 +4,20 @@ import functools
 import time
 import base64
 import numpy as np
 import gradio as gr
+import safetensors.torch
 import logging
 
 from typing import Any, Callable, Dict
 
+from modules.safe import unsafe_torch_load
 from scripts.logging import logger
 
 
 def load_state_dict(ckpt_path, location="cpu"):
     _, extension = os.path.splitext(ckpt_path)
     if extension.lower() == ".safetensors":
-        import safetensors.torch
-
         state_dict = safetensors.torch.load_file(ckpt_path, device=location)
     else:
-        state_dict = get_state_dict(
-            torch.load(ckpt_path, map_location=torch.device(location))
-        )
+        state_dict = unsafe_torch_load(ckpt_path, map_location=torch.device(location))
+    state_dict = get_state_dict(state_dict)
     logger.info(f"Loaded state_dict from [{ckpt_path}]")
     return state_dict
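
The rewritten loader keeps the same split: .safetensors files go through safetensors.torch.load_file, everything else through a torch pickle load (unsafe_torch_load is the web UI's saved reference to the original torch.load, taken before it installs its restricted unpickler), and get_state_dict now normalizes both branches. A standalone approximation, with plain torch.load standing in for unsafe_torch_load and get_state_dict omitted:

import os
import torch
import safetensors.torch

def load_weights(ckpt_path: str, location: str = "cpu") -> dict:
    _, extension = os.path.splitext(ckpt_path)
    if extension.lower() == ".safetensors":
        # safetensors files carry raw tensors; no pickle machinery involved
        return safetensors.torch.load_file(ckpt_path, device=location)
    # .ckpt / .pt checkpoints go through torch's pickle-based loader
    return torch.load(ckpt_path, map_location=torch.device(location))
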
@@ -26,13 +26,13 @@ class TestExternalCodeWorking(unittest.TestCase):
         self.scripts.alwayson_scripts = [self.cn_script]
         self.script_args = [None] * self.cn_script.args_from
 
-        self.initial_max_models = shared.opts.data.get("control_net_max_models_num", 1)
-        shared.opts.data.update(control_net_max_models_num=self.max_models)
+        self.initial_max_models = shared.opts.data.get("control_net_unit_count", 3)
+        shared.opts.data.update(control_net_unit_count=self.max_models)
 
         self.extra_models = 0
 
     def tearDown(self):
-        shared.opts.data.update(control_net_max_models_num=self.initial_max_models)
+        shared.opts.data.update(control_net_unit_count=self.initial_max_models)
 
     def get_expected_args_to(self):
         args_len = max(self.max_models, len(self.cn_units))
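
The test change is mechanical: the same save-in-setUp, restore-in-tearDown dance, re-pointed at the renamed option. The pattern in isolation (a plain dict stands in for shared.opts):

import unittest

class OptionOverrideTest(unittest.TestCase):
    opts = {"control_net_unit_count": 3}

    def setUp(self):
        # Remember the pre-test value so tearDown can undo the override.
        self.initial_count = self.opts.get("control_net_unit_count", 3)
        self.opts.update(control_net_unit_count=5)

    def tearDown(self):
        self.opts.update(control_net_unit_count=self.initial_count)

    def test_override_visible(self):
        self.assertEqual(self.opts["control_net_unit_count"], 5)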