
Support for SDXL refiner #227


Merged
merged 33 commits into from Sep 26, 2023
Changes from 23 commits
33 commits
4cd95e1
Initial support for SDXL refiner
ZachNagengast Aug 3, 2023
7c03b1d
Cleanup
ZachNagengast Aug 3, 2023
db154eb
Add arg for converting Unet in float32 precision
ZachNagengast Aug 4, 2023
1d8bfff
Merge branch 'main' into sdxl-refiner
ZachNagengast Aug 8, 2023
d3be8b9
Setup scale factor with pipeline in CLI
ZachNagengast Aug 8, 2023
5f9d50c
Update cli arg and future warning
ZachNagengast Aug 20, 2023
fc204ef
Merge branch 'main' into sdxl-refiner
ZachNagengast Aug 22, 2023
f4142cf
Bundle refiner unet if specified
ZachNagengast Aug 25, 2023
3aa0e23
Merge branch 'main' into sdxl-refiner
ZachNagengast Aug 25, 2023
58b4f62
Update script for bundled refiner
ZachNagengast Aug 28, 2023
137d0c8
Merge branch 'main' into sdxl-refiner
ZachNagengast Aug 28, 2023
e28f812
Flip skip_model_load bool
ZachNagengast Aug 28, 2023
2543c1f
Cleanup
ZachNagengast Aug 28, 2023
b211d2d
Support bundled UnetRefiner
ZachNagengast Aug 29, 2023
fdc0185
Add seperate refiner config value
ZachNagengast Sep 8, 2023
81dd25b
Update readme for SDXL refiner
ZachNagengast Sep 10, 2023
d589ab8
Merge branch 'main' into sdxl-refiner
ZachNagengast Sep 11, 2023
43619e0
Add condition for new SDXL coreml input features
ZachNagengast Sep 12, 2023
352f349
Revert pipeline interface change, add extra logging on pipe load
ZachNagengast Sep 13, 2023
e5724db
Reset model_version after refiner conversion
ZachNagengast Sep 14, 2023
90864bc
Reset model_version before refiner conversion but after pipe init
ZachNagengast Sep 15, 2023
e2e2b16
Add refiner chunking
ZachNagengast Sep 19, 2023
7cb53e8
Ensure unets are unloaded for reduceMemory true
ZachNagengast Sep 19, 2023
84450eb
Handle missing UnetRefiner.mlmodelc on pipeline load
ZachNagengast Sep 19, 2023
1dae882
Prewarm refiner on load, unload on complete
ZachNagengast Sep 19, 2023
ae516e7
Force cpu_and_gpu for VAE until it can be fixed
ZachNagengast Sep 19, 2023
efda893
Include output dtype of np.float32 for all conversions
ZachNagengast Sep 19, 2023
a5f0280
Allow a custom VAE to be converted.
pcuenca Sep 25, 2023
17dec17
Revert hardcoded reduceMemory
ZachNagengast Sep 25, 2023
580bcbd
Merge branch 'main' into sdxl-refiner
ZachNagengast Sep 25, 2023
c69deb7
Fix merge
ZachNagengast Sep 25, 2023
68c39b3
Default chunking arg for --merge-chunks-in-pipeline-model when called…
ZachNagengast Sep 25, 2023
afdfbcd
Merge pull request #2 from pcuenca/custom-vae-version
ZachNagengast Sep 25, 2023
12 changes: 9 additions & 3 deletions README.md

Is --xl-version used for the base model only, or do you need to use it along with --refiner-version for the refiner?

Contributor Author

@jrittvo That's a good question, I suppose it would work for a non-xl model in the same way. All it does is convert the unet for the provided model and rename it as the refiner for that base model. The only real requirement is that both are the same kind of model so the latents match up. That covers the conversion, but the pipeline on the Swift side has nothing to handle a UnetRefiner.mlmodelc for non-XL models, so it would just be ignored.

Contributor Author

To be more specific though: yes, the --xl-version flag should be used along with --refiner-version in order to convert the refiner in the same way as the --model-version input.

@@ -209,10 +209,11 @@ An example `<selected-recipe-string-key>` would be `"recipe_4.50_bit_mixedpalett
e.g.:

```bash
python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --convert-vae-decoder --convert-text-encoder --xl-version --model-version stabilityai/stable-diffusion-xl-base-1.0 --bundle-resources-for-swift-cli --attention-implementation ORIGINAL -o <output-dir>
python -m python_coreml_stable_diffusion.torch2coreml --convert-unet --convert-vae-decoder --convert-text-encoder --xl-version --model-version stabilityai/stable-diffusion-xl-base-1.0 --refiner-version stabilityai/stable-diffusion-xl-refiner-1.0 --bundle-resources-for-swift-cli --attention-implementation ORIGINAL -o <output-dir>
```

- `--xl-version`: Additional argument to pass to the conversion script when specifying an XL model
- `--refiner-version`: Additional argument to pass to the conversion script when specifying an XL refiner model, required for ["Ensemble of Expert Denoisers"](https://huggingface.co/docs/diffusers/main/en/api/pipelines/stable_diffusion/stable_diffusion_xl#1-ensemble-of-expert-denoisers) inference.
- `--attention-implementation ORIGINAL` (recommended for `cpuAndGPU`)
- Due to known float16 overflow issues in the VAE, it runs in float32 precision for now
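The linked "Ensemble of Expert Denoisers" scheme hands the first fraction of the denoising steps to the base unet and the remainder to the refiner unet. As a rough illustration of the split (the function name and rounding behavior here are assumptions for the sketch, not this repo's API):

```python
def split_denoising_steps(total_steps: int, refiner_start: float) -> tuple[int, int]:
    """Split inference steps between the base and refiner unets.

    refiner_start is the fraction of steps run on the base model,
    so higher values leave fewer steps for the refiner.
    """
    if not 0.0 <= refiner_start <= 1.0:
        raise ValueError("refiner_start must be between 0 and 1")
    base_steps = round(total_steps * refiner_start)
    return base_steps, total_steps - base_steps

# e.g. 50 steps with a refiner start fraction of 0.8:
# the base unet runs 40 steps and the refiner runs the final 10.
```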

@@ -225,7 +226,7 @@ swift run StableDiffusionSample <prompt> --resource-path <output-mlpackages-dire
```

- Only `--compute-units cpuAndGPU` is supported for now
- Only the `base` model is supported, `refiner` model is not yet supported
- Only the `base` model is required; the `refiner` model is optional and is used by default if present in the resource directory
- ControlNet for XL is not yet supported


@@ -365,6 +366,7 @@ This generally takes 15-20 minutes on an M1 MacBook Pro. Upon successful executi

- `--model-version`: The model version name as published on the [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion)

- `--refiner-version`: The refiner version name as published on the [Hugging Face Hub](https://huggingface.co/models?search=stable-diffusion). This argument is optional; if specified, the refiner unet is converted and bundled alongside the model unet.

- `--bundle-resources-for-swift-cli`: Compiles all 4 models and bundles them along with necessary resources for text tokenization into `<output-mlpackages-directory>/Resources` which should be provided as input to the Swift package. This flag is not necessary for the diffusers-based Python pipeline.

@@ -439,7 +441,7 @@ This Swift package contains two products:

Both of these products require the Core ML models and tokenization resources to be supplied. When specifying resources via a directory path that directory must contain the following:

- `TextEncoder.mlmodelc` (text embedding model)
- `TextEncoder.mlmodelc` or `TextEncoder2.mlmodelc` (text embedding model)
- `Unet.mlmodelc` or `UnetChunk1.mlmodelc` & `UnetChunk2.mlmodelc` (denoising autoencoder model)
- `VAEDecoder.mlmodelc` (image decoder model)
- `vocab.json` (tokenizer vocabulary file)
@@ -453,6 +455,10 @@ Optionally, it may also include the safety checker model that some versions of S

- `SafetyChecker.mlmodelc`

Optionally, for the SDXL refiner:

- `UnetRefiner.mlmodelc` (refiner unet model)

Optionally, for ControlNet:

- `ControlledUNet.mlmodelc` or `ControlledUnetChunk1.mlmodelc` & `ControlledUnetChunk2.mlmodelc` (enabled to receive ControlNet values)
154 changes: 109 additions & 45 deletions python_coreml_stable_diffusion/torch2coreml.py

Large diffs are not rendered by default.

58 changes: 54 additions & 4 deletions swift/StableDiffusion/pipeline/CGImage+vImage.swift
@@ -4,6 +4,7 @@
import Foundation
import Accelerate
import CoreML
import CoreGraphics

@available(iOS 16.0, macOS 13.0, *)
extension CGImage {
@@ -77,7 +78,7 @@ extension CGImage {
else {
throw ShapedArrayError.incorrectFormatsConvertingToShapedArray
}

var sourceImageBuffer = try vImage_Buffer(cgImage: self)

var mediumDestination = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: mediumFormat.bitsPerPixel)
@@ -88,7 +89,7 @@
nil,
vImage_Flags(kvImagePrintDiagnosticsToConsole),
nil)

guard let converter = converter?.takeRetainedValue() else {
throw ShapedArrayError.vImageConverterNotInitialized
}
@@ -99,7 +100,7 @@
var destinationR = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var destinationG = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))
var destinationB = try vImage_Buffer(width: Int(width), height: Int(height), bitsPerPixel: 8 * UInt32(MemoryLayout<Float>.size))

var minFloat: [Float] = Array(repeating: minValue, count: 4)
var maxFloat: [Float] = Array(repeating: maxValue, count: 4)

@@ -125,7 +126,56 @@
let imageData = redData + greenData + blueData

let shapedArray = MLShapedArray<Float32>(data: imageData, shape: [1, 3, self.height, self.width])


return shapedArray
}

private func normalizePixelValues(pixel: UInt8) -> Float {
return (Float(pixel) / 127.5) - 1.0
}

public func toRGBShapedArray(minValue: Float, maxValue: Float)
throws -> MLShapedArray<Float32> {
let image = self
let width = image.width
let height = image.height
let alphaMaskValue: Float = minValue

guard let colorSpace = CGColorSpace(name: CGColorSpace.sRGB),
let context = CGContext(data: nil, width: width, height: height, bitsPerComponent: 8, bytesPerRow: 4 * width, space: colorSpace, bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue),
let ptr = context.data?.bindMemory(to: UInt8.self, capacity: width * height * 4) else {
return []
}

context.draw(image, in: CGRect(x: 0, y: 0, width: width, height: height))

var redChannel = [Float](repeating: 0, count: width * height)
var greenChannel = [Float](repeating: 0, count: width * height)
var blueChannel = [Float](repeating: 0, count: width * height)

for y in 0..<height {
for x in 0..<width {
let i = 4 * (y * width + x)
if ptr[i+3] == 0 {
// Alpha mask for controlnets
redChannel[y * width + x] = alphaMaskValue
Contributor
@atiorh Aug 15, 2023

@vzsg had specified a similar convention in #209 for ControlNet inpainting. Tagging for visibility to ensure we are not moving in a different direction with this potential switch from plannerRGBShapedArray to toRGBShapedArray.

Contributor Author
I attempted to replicate it with this new function, and the plannerRGBShapedArray function is still in use for the controlnet conditioning. Would be great to get another eye on it to confirm it's working as intended.

greenChannel[y * width + x] = alphaMaskValue
blueChannel[y * width + x] = alphaMaskValue
} else {
redChannel[y * width + x] = normalizePixelValues(pixel: ptr[i])
greenChannel[y * width + x] = normalizePixelValues(pixel: ptr[i+1])
blueChannel[y * width + x] = normalizePixelValues(pixel: ptr[i+2])
}
}
}

let colorShape = [1, 1, height, width]
let redShapedArray = MLShapedArray<Float32>(scalars: redChannel, shape: colorShape)
let greenShapedArray = MLShapedArray<Float32>(scalars: greenChannel, shape: colorShape)
let blueShapedArray = MLShapedArray<Float32>(scalars: blueChannel, shape: colorShape)

let shapedArray = MLShapedArray<Float32>(concatenating: [redShapedArray, greenShapedArray, blueShapedArray], alongAxis: 1)

return shapedArray
}
}
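For reference, a rough NumPy analogue of the `toRGBShapedArray` conversion in the diff above (an illustrative sketch, not code from this PR): it maps 8-bit RGBA pixels into [-1, 1] float planes shaped `(1, 3, H, W)`, writing the mask value wherever alpha is zero, matching the `pixel / 127.5 - 1.0` normalization.

```python
import numpy as np

def to_rgb_planes(rgba: np.ndarray, mask_value: float = -1.0) -> np.ndarray:
    """Convert uint8 RGBA pixels (H, W, 4) to float32 planes (1, 3, H, W).

    Fully transparent pixels are replaced with mask_value, mirroring
    the alpha-mask convention used for controlnets.
    """
    rgb = rgba[..., :3].astype(np.float32) / 127.5 - 1.0
    transparent = rgba[..., 3] == 0
    rgb[transparent] = mask_value  # masks all three channels at once
    # (H, W, 3) -> (3, H, W) -> (1, 3, H, W), i.e. planar RGB layout
    return np.transpose(rgb, (2, 0, 1))[np.newaxis]
```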
2 changes: 1 addition & 1 deletion swift/StableDiffusion/pipeline/Encoder.swift
@@ -93,7 +93,7 @@ public struct Encoder: ResourceManaging {

var inputDescription: MLFeatureDescription {
try! model.perform { model in
model.modelDescription.inputDescriptionsByName["z"]!
model.modelDescription.inputDescriptionsByName.first!.value
Contributor
This preserves backwards compatibility so I am not concerned about the rename and thanks for the semantically correct fix :)

Contributor
No need to update, just appreciating the change

}
}

@@ -20,8 +20,14 @@ public struct PipelineConfiguration: Hashable {
public var negativePrompt: String = ""
/// Starting image for image2image or in-painting
public var startingImage: CGImage? = nil
//public var maskImage: CGImage? = nil
/// Fraction of inference steps to be used in `.imageToImage` pipeline mode
/// Must be between 0 and 1
/// Higher values will result in greater transformation of the `startingImage`
public var strength: Float = 1.0
/// Fraction of inference steps at which to start using the refiner unet, if present, in `textToImage` mode
/// Must be between 0 and 1
/// Higher values will result in fewer refiner steps
public var refinerStart: Float = 0.8
/// Number of images to generate
public var imageCount: Int = 1
/// Number of inference steps to perform
@@ -44,7 +50,19 @@
public var encoderScaleFactor: Float32 = 0.18215
/// Scale factor to use on the latent before decoding
public var decoderScaleFactor: Float32 = 0.18215

/// If `originalSize` is not the same as `targetSize` the image will appear to be down- or upsampled.
Contributor
@atiorh Aug 15, 2023

I am inclined to recommend a dedicated StableDiffusionXLPipeline.Configuration.swift as quite a few things on the input conditioning and pipeline are different. @ZachNagengast & @pcuenca What do you think?

Contributor Author

Yea, I'd agree it probably shouldn't be a requirement on the caller to set the scale factor properly, especially when we already know the pipeline is SDXL and can set the default properly. My question would be whether we'd want a new class for this or just setup the defaults somewhere in the pipeline. Or potentially a broader option of just including the config files with the converted models directly, alongside the merges.txt and vocab.json, and using those for the default config.

Contributor

potentially a broader option of just including the config files with the converted models directly

That would be nice and we can even make that backward compatible. However, it still wouldn't address the argument set mismatch across XL and non-XL right? IMO, the easiest is to create 2 (XL and non-XL) default configs and then add your proposal from above as an extension.

Contributor Author

Yea that makes sense to me, and seems to be the approach Diffusers likes to take. My only nitpick is the repeated code, but it can be consolidated a fair amount by committing to the two pipelines and pulling out shared code as needed.

Contributor

I agree, we can refactor over time :)

Contributor

I agree with the approach discussed, separate configs do seem necessary to make usage simpler.

/// Part of SDXL’s micro-conditioning as explained in section 2.2 of https://huggingface.co/papers/2307.01952.
public var originalSize: Float32 = 1024
/// `cropsCoordsTopLeft` can be used to generate an image that appears to be “cropped” from the position `cropsCoordsTopLeft` downwards.
/// Favorable, well-centered images are usually achieved by setting `cropsCoordsTopLeft` to (0, 0).
public var cropsCoordsTopLeft: Float32 = 0
/// For most cases, `target_size` should be set to the desired height and width of the generated image.
public var targetSize: Float32 = 1024
/// Used to simulate an aesthetic score of the generated image by influencing the positive text condition.
public var aestheticScore: Float32 = 6
/// Can be used to simulate an aesthetic score of the generated image by influencing the negative text condition.
public var negativeAestheticScore: Float32 = 2.5

/// Given the configuration, what mode will be used for generation
public var mode: PipelineMode {
guard startingImage != nil else {
2 changes: 2 additions & 0 deletions swift/StableDiffusion/pipeline/StableDiffusionPipeline.swift
@@ -24,7 +24,9 @@ public enum StableDiffusionRNG {
}

public enum PipelineError: String, Swift.Error {
case missingUnetInputs
case startingImageProvidedWithoutEncoder
case startingText2ImgWithoutTextEncoder
case unsupportedOSVersion
}

25 changes: 24 additions & 1 deletion swift/StableDiffusion/pipeline/StableDiffusionXL+Resources.swift
@@ -15,6 +15,9 @@ public extension StableDiffusionXLPipeline {
public let unetURL: URL
public let unetChunk1URL: URL
public let unetChunk2URL: URL
public let unetRefinerURL: URL
public let unetRefinerChunk1URL: URL
public let unetRefinerChunk2URL: URL
public let decoderURL: URL
public let encoderURL: URL
public let vocabURL: URL
@@ -26,6 +29,9 @@
unetURL = baseURL.appending(path: "Unet.mlmodelc")
unetChunk1URL = baseURL.appending(path: "UnetChunk1.mlmodelc")
unetChunk2URL = baseURL.appending(path: "UnetChunk2.mlmodelc")
unetRefinerURL = baseURL.appending(path: "UnetRefiner.mlmodelc")
unetRefinerChunk1URL = baseURL.appending(path: "UnetRefinerChunk1.mlmodelc")
unetRefinerChunk2URL = baseURL.appending(path: "UnetRefinerChunk2.mlmodelc")
decoderURL = baseURL.appending(path: "VAEDecoder.mlmodelc")
encoderURL = baseURL.appending(path: "VAEEncoder.mlmodelc")
vocabURL = baseURL.appending(path: "vocab.json")
@@ -51,7 +57,12 @@
/// Expect URL of each resource
let urls = ResourceURLs(resourcesAt: baseURL)
let tokenizer = try BPETokenizer(mergesAt: urls.mergesURL, vocabularyAt: urls.vocabURL)
let textEncoder = TextEncoderXL(tokenizer: tokenizer, modelAt: urls.textEncoderURL, configuration: config)
let textEncoder: TextEncoderXL?
if FileManager.default.fileExists(atPath: urls.textEncoderURL.path) {
textEncoder = TextEncoderXL(tokenizer: tokenizer, modelAt: urls.textEncoderURL, configuration: config)
} else {
textEncoder = nil
}

// padToken is different in the second XL text encoder
let tokenizer2 = try BPETokenizer(mergesAt: urls.mergesURL, vocabularyAt: urls.vocabURL, padToken: "!")
@@ -67,6 +78,17 @@
unet = Unet(modelAt: urls.unetURL, configuration: config)
}

// Refiner Unet model
let unetRefiner: Unet?
if FileManager.default.fileExists(atPath: urls.unetRefinerChunk1URL.path) &&
FileManager.default.fileExists(atPath: urls.unetRefinerChunk2URL.path) {
unetRefiner = Unet(chunksAt: [urls.unetRefinerChunk1URL, urls.unetRefinerChunk2URL],
configuration: config)
} else {
unetRefiner = Unet(modelAt: urls.unetRefinerURL, configuration: config)
}


// Image Decoder
let decoder = Decoder(modelAt: urls.decoderURL, configuration: config)

@@ -83,6 +105,7 @@
textEncoder: textEncoder,
textEncoder2: textEncoder2,
unet: unet,
unetRefiner: unetRefiner,
decoder: decoder,
encoder: encoder,
reduceMemory: reduceMemory