refactor: enhance diffusion component detection

thxCode · thxCode · commit 31bc6f589f82 · 2025-07-18T23:45:52.000+08:00
Signed-off-by: thxCode <thxcode0824@gmail.com>
diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go
@@ -1844,7 +1844,7 @@ func mainAction(c *cli.Context) error {
 				}
 				bds = [][]any{
 					{
-						a.DiffusionArchitecture,
+						sprintf(tenary(a.DiffusionArchitecture != "", a.DiffusionArchitecture, "N/A")),
 						sprintf(tenary(a.DiffusionHasConditioners(), a.DiffusionConditioners, "N/A")),
 						sprintf(tenary(a.DiffusionHasAutoencoder(), a.DiffusionAutoencoder, "N/A")),
 					},
@@ -2000,14 +2000,14 @@ func mainAction(c *cli.Context) error {
 		for i := range lmes.Items {
 			if !inShort {
 				bds[i] = []any{
-					sprintf(lmes.Architecture),
+					sprintf(tenary(lmes.Architecture != "", lmes.Architecture, "N/A")),
 					sprintf(lmes.ContextSize),
 					sprintf("%d / %d", lmes.LogicalBatchSize, lmes.PhysicalBatchSize),
 					sprintf(tenary(flashAttention, tenary(lmes.FlashAttention, "Enabled", "Unsupported"), "Disabled")),
 					sprintf(tenary(mmap, tenary(!lmes.NoMMap, "Enabled", "Unsupported"), "Disabled")),
 					sprintf(tenary(lmes.EmbeddingOnly, "Yes", "No")),
 					sprintf(tenary(lmes.Reranking, "Supported", "Unsupported")),
-					sprintf(tenary(lmes.Distributable, "Supported", "Unsupported")),
+					sprintf(tenary(lmes.Architecture != "" && lmes.Distributable, "Supported", "Unsupported")),
 					sprintf(tenary(lmes.Items[i].FullOffloaded, sprintf("%d (%d + 1)",
 						lmes.Items[i].OffloadLayers, lmes.Items[i].OffloadLayers-1), lmes.Items[i].OffloadLayers)),
 					sprintf(tenary(lmes.Items[i].FullOffloaded, "Yes", "No")),
@@ -2071,10 +2071,10 @@ func mainAction(c *cli.Context) error {
 		for i := range sdes.Items {
 			if !inShort {
 				bds[i] = []any{
-					sprintf(sdes.Architecture),
+					sprintf(tenary(sdes.Architecture != "", sdes.Architecture, "N/A")),
 					sprintf(tenary(flashAttention, tenary(sdes.FlashAttention, "Enabled", "Unsupported"), "Disabled")),
 					sprintf(tenary(mmap, tenary(!sdes.NoMMap, "Enabled", "Unsupported"), "Disabled")),
-					sprintf(tenary(sdes.Distributable, "Supported", "Unsupported")),
+					sprintf(tenary(sdes.Architecture != "" && sdes.Distributable, "Supported", "Unsupported")),
 					sprintf(tenary(sdes.Items[i].FullOffloaded, "Yes", "No")),
 				}
 			}
diff --git a/file_architecture.go b/file_architecture.go
@@ -414,11 +414,15 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {
 
 		// Conditioner
 
-		openAiClipVitL14Key = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight"   // OpenAI CLIP ViT-L/14
-		openClipVitH14Key   = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight"   // OpenCLIP ViT-H/14
-		openClipVitG14Key   = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14
-		t5xxlKey            = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight"      // Google T5-xxl
-		t5xxlKey2           = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight"
+		openAiClipVitL14Key  = "cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight" // OpenAI CLIP ViT-L/14
+		openAiClipVitL14Key2 = "text_model.encoder.layers.11.self_attn.k_proj.weight"
+		openClipVitH14Key    = "cond_stage_model.transformer.text_model.encoder.layers.22.self_attn.k_proj.weight" // OpenCLIP ViT-H/14
+		openClipVitH14Key2   = "text_model.encoder.layers.22.self_attn.k_proj.weight"
+		openClipVitG14Key    = "cond_stage_model.1.transformer.text_model.encoder.layers.31.self_attn.k_proj.weight" // OpenCLIP ViT-G/14
+		openClipVitG14Key2   = "text_model.encoder.layers.31.self_attn.k_proj.weight"
+		t5xxlKey             = "cond_stage_model.1.transformer.encoder.block.23.layer.0.SelfAttention.k.weight" // Google T5-xxl
+		t5xxlKey2            = "cond_stage_model.2.transformer.encoder.block.23.layer.0.SelfAttention.k.weight"
+		t5xxlKey3            = "encoder.block.23.layer.0.SelfAttention.k.weight"
 	)
 
 	tis, _ := gf.TensorInfos.Index([]string{
@@ -439,10 +443,14 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {
 		fluxFillFeatureKey2,
 
 		openAiClipVitL14Key,
+		openAiClipVitL14Key2,
 		openClipVitH14Key,
+		openClipVitH14Key2,
 		openClipVitG14Key,
+		openClipVitG14Key2,
 		t5xxlKey,
 		t5xxlKey2,
+		t5xxlKey3,
 	})
 
 	ga.Type = "model"
@@ -513,12 +521,29 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {
 			}
 		}
 		ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond)
+	} else if ti, ok := tis[openAiClipVitL14Key2]; ok {
+		cond := GGUFArchitectureDiffusionConditioner{
+			Architecture: "OpenAI CLIP ViT-L/14",
+			FileType:     ti.GetFileType(),
+		}
+		if ti, ok = tis[openClipVitH14Key2]; ok {
+			cond = GGUFArchitectureDiffusionConditioner{
+				Architecture: "OpenCLIP ViT-H/14",
+				FileType:     ti.GetFileType(),
+			}
+		}
+		ga.DiffusionConditioners = append(ga.DiffusionConditioners, cond)
 	}
 	if ti, ok := tis[openClipVitG14Key]; ok {
 		ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{
 			Architecture: "OpenCLIP ViT-G/14",
 			FileType:     ti.GetFileType(),
 		})
+	} else if ti, ok = tis[openClipVitG14Key2]; ok {
+		ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{
+			Architecture: "OpenCLIP ViT-G/14",
+			FileType:     ti.GetFileType(),
+		})
 	}
 	if ti, ok := tis[t5xxlKey]; ok {
 		ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{
@@ -530,12 +555,23 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) {
 			Architecture: "Google T5-xxl",
 			FileType:     ti.GetFileType(),
 		})
+	} else if ti, ok = tis[t5xxlKey3]; ok {
+		ga.DiffusionConditioners = append(ga.DiffusionConditioners, GGUFArchitectureDiffusionConditioner{
+			Architecture: "Google T5-xxl",
+			FileType:     ti.GetFileType(),
+		})
 	}
 
-	if tis := gf.TensorInfos.Search(regexp.MustCompile(`^first_stage_model\..*`)); len(tis) != 0 {
-		ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{
-			Architecture: ga.DiffusionArchitecture + " VAE",
-			FileType:     GGUFTensorInfos(tis).GetFileType(),
+	for _, re := range []*regexp.Regexp{
+		regexp.MustCompile(`^first_stage_model\..*`),
+		regexp.MustCompile(`^decoder\.conv_in\..*`),
+	} {
+		if tis := gf.TensorInfos.Search(re); len(tis) != 0 {
+			ga.DiffusionAutoencoder = &GGUFArchitectureDiffusionAutoencoder{
+				Architecture: ga.DiffusionArchitecture + " VAE",
+				FileType:     GGUFTensorInfos(tis).GetFileType(),
+			}
+			break
 		}
 	}
 
diff --git a/file_estimate__llamacpp.go b/file_estimate__llamacpp.go
@@ -641,7 +641,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG
 	// Computation.
 	{
 		// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243.
-		maxNodes := max(65536, uint64(5*len(gf.TensorInfos)))
+		maxNodes := max(1024, uint64(8*len(gf.TensorInfos)))
 
 		// Bootstrap, compute metadata.
 		cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)
diff --git a/file_metadata.go b/file_metadata.go
@@ -146,6 +146,9 @@ var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{
 	regexp.MustCompile(`^model\.diffusion_model\..*`),
 	regexp.MustCompile(`^double_blocks\..*`),
 	regexp.MustCompile(`^joint_blocks\..*`),
+	regexp.MustCompile(`^decoder\..*`),
+	regexp.MustCompile(`^encoder\..*`),
+	regexp.MustCompile(`^text_model\..*`),
 }
 
 // Metadata returns the metadata of the GGUF file.

Original file line number	Diff line number	Diff line change
`@@ -641,7 +641,7 @@ func (gf *GGUFFile) estimateLLaMACppRunInModel(o *_GGUFRunEstimateOptions, a *GG`
`641`	`641`	`// Computation.`
`642`	`642`	`{`
`643`	`643`	`// See https://github.com/ggml-org/llama.cpp/blob/ec9e0301fef6476df83e94842c3b625501c95566/src/llama-context.cpp#L1241-L1243.`
`644`		`- maxNodes := max(65536, uint64(5*len(gf.TensorInfos)))`
	`644`	`+ maxNodes := max(1024, uint64(8*len(gf.TensorInfos)))`
`645`	`645`
`646`	`646`	`// Bootstrap, compute metadata.`
`647`	`647`	`cm := GGMLTensorOverhead()*maxNodes + GGMLComputationGraphOverhead(maxNodes, false)`
Original file line number	Diff line number	Diff line change
`@@ -146,6 +146,9 @@ var _GGUFPotentialDiffusionArchitectureTensorsRegexes = []*regexp.Regexp{`
`146`	`146`	regexp.MustCompile(`^model\.diffusion_model\..*`),
`147`	`147`	regexp.MustCompile(`^double_blocks\..*`),
`148`	`148`	regexp.MustCompile(`^joint_blocks\..*`),
	`149`	+ regexp.MustCompile(`^decoder\..*`),
	`150`	+ regexp.MustCompile(`^encoder\..*`),
	`151`	+ regexp.MustCompile(`^text_model\..*`),
`149`	`152`	`}`
`150`	`153`
`151`	`154`	`// Metadata returns the metadata of the GGUF file.`