Merge pull request #593 from NexaAI/feat/mengsheng/image-gen-serve

mengshengwu · web-flow · commit b14b7bc929e3 · 2025-09-26T13:51:08.000+08:00
feat: add image generation support for server
diff --git a/runner/Makefile b/runner/Makefile
@@ -23,7 +23,7 @@ else
 	UNZIP   := unzip -q
 endif
 
-.PHONY: build link xcopy download clean
+.PHONY: build link xcopy download clean serve
 
 build:
 	go build \
@@ -58,3 +58,6 @@ download: clean
 
 clean:
 	-$(RM) build
+
+# serve: run `./build/nexa serve`. The target declares no prerequisites, so
+# the binary must already exist at ./build/nexa (presumably produced by
+# `make build` - confirm against the build recipe).
+serve:
+	./build/nexa serve
diff --git a/runner/nexa-sdk/image_gen.go b/runner/nexa-sdk/image_gen.go
@@ -379,6 +379,13 @@ func (ig *ImageGen) Destroy() error {
 	return nil
 }
 
+// Reset is a no-op for image generation: it logs the call (with ig.ptr) at
+// debug level and always returns a nil error.
+func (ig *ImageGen) Reset() error {
+	slog.Debug("Reset called", "ptr", ig.ptr)
+	// Image generation doesn't maintain state between generations, so there
+	// is nothing to clear here.
+	return nil
+}
+
 // Txt2Img generates an image from text prompt
 func (ig *ImageGen) Txt2Img(input ImageGenTxt2ImgInput) (ImageGenOutput, error) {
 	slog.Debug("Txt2Img called", "input", input)
diff --git a/runner/server/docs/swagger.yaml b/runner/server/docs/swagger.yaml
@@ -63,6 +63,43 @@ paths:
               schema:
                 $ref: '#/components/schemas/EmbeddingResponse'
 
+  /v1/images/generations:
+    post:
+      summary: Creates an image given a prompt
+      description: Creates an image given a prompt. This endpoint follows OpenAI DALL-E 3 API specification for compatibility.
+      operationId: PostV1ImagesGenerations
+      requestBody:
+        required: true
+        content:
+          application/json:
+            schema:
+              $ref: '#/components/schemas/ImageGenerationRequest'
+      responses:
+        '200':
+          description: Successful image generation response
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ImageGenerationResponse'
+        '400':
+          description: Bad request - invalid parameters
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '404':
+          description: Model not found
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+        '500':
+          description: Internal server error
+          content:
+            application/json:
+              schema:
+                $ref: '#/components/schemas/ErrorResponse'
+
   /v1/reranking:
     post:
       summary: Reranks the given documents for the given query
@@ -581,6 +618,75 @@ components:
           type: integer
           description: The index of the embedding in the list of embeddings
 
+    # ---------- Image Generation ----------
+    ImageGenerationRequest:
+      type: object
+      required: [model, prompt]
+      properties:
+        model:
+          type: string
+          description: ID of the model to use
+          default: "nexaml/sdxl-turbo-ryzen-ai"
+        prompt:
+          type: string
+          description: A text description of the desired image(s). The maximum length is 1000 characters.
+          default: "A white cat with red eyes"
+        n:
+          type: integer
+          minimum: 1
+          maximum: 10
+          description: The number of images to generate. Must be between 1 and 10.
+          default: 1
+        size:
+          type: string
+          enum: ["512x512", "1024x1024", "1792x1024", "1024x1792"]
+          description: The size of the generated images. Must be one of the supported sizes.
+          default: "512x512"
+        quality:
+          type: string
+          enum: ["standard", "hd"]
+          description: The quality of the image that will be generated
+          default: "standard"
+        style:
+          type: string
+          enum: ["vivid", "natural"]
+          description: The style of the generated images
+          default: "vivid"
+        response_format:
+          type: string
+          enum: ["url", "b64_json"]
+          description: The format in which the generated images are returned
+          default: "url"
+        user:
+          type: string
+          description: A unique identifier representing your end-user
+
+    ImageGenerationResponse:
+      type: object
+      required: [created, data]
+      properties:
+        created:
+          type: integer
+          description: The Unix timestamp (in seconds) of when the image was created
+        data:
+          type: array
+          items:
+            $ref: '#/components/schemas/ImageGenerationData'
+          description: The list of generated images
+
+    ImageGenerationData:
+      type: object
+      properties:
+        url:
+          type: string
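+          description: The location of the generated image, if response_format is "url". The server currently returns the path of the generated file on its local filesystem rather than an HTTP URL.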
+        b64_json:
+          type: string
+          description: The generated image as a base64-encoded data URI (a data-scheme prefix carrying the detected MIME type, followed by the base64 payload), if response_format is "b64_json"
+        revised_prompt:
+          type: string
+          description: The prompt that was used to generate the image, if there was a revision to the prompt
+
     # ---------- Reranking ----------
     RerankingRequest:
       type: object
@@ -643,6 +749,14 @@ components:
           description: Additional metadata about the document
 
     # ---------- Common ----------
+    ErrorResponse:
+      type: object
+      required: [error]
+      properties:
+        error:
+          type: string
+          description: Error message describing what went wrong
+
     TokenUsage:
       type: object
       properties:
diff --git a/runner/server/handler/image.go b/runner/server/handler/image.go
@@ -0,0 +1,173 @@
+package handler
+
+import (
+	"encoding/base64"
+	"errors"
+	"fmt"
+	"log/slog"
+	"net/http"
+	"os"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/gin-gonic/gin"
+	"github.com/openai/openai-go"
+
+	"github.com/NexaAI/nexa-sdk/runner/internal/types"
+	nexa_sdk "github.com/NexaAI/nexa-sdk/runner/nexa-sdk"
+	"github.com/NexaAI/nexa-sdk/runner/server/service"
+)
+
+// ImageGenerations handles POST /images/generations. It binds an OpenAI-style
+// image generation request, generates param.N images for param.Prompt with the
+// requested model and replies with an openai.ImagesResponse.
+//
+// Request handling:
+//   - n defaults to 1 and must be between 1 and 10 (the range documented in
+//     the API schema); anything else is rejected with 400.
+//   - size defaults to 512x512 (the documented default) and must have the
+//     form "<width>x<height>"; anything else is rejected with 400.
+//   - response_format defaults to "url". In that mode the path of the
+//     generated file is returned in the url field and the file is kept. With
+//     "b64_json" the file is read, returned as a base64 data URI and removed.
+//
+// Request validation happens before the model is loaded, so a malformed
+// request never pays for a model load. Model loading and generation failures
+// are reported as 500.
+//
+// @Router			/images/generations [post]
+// @Summary		Creates an image given a prompt.
+// @Description	Creates an image given a prompt. This endpoint follows OpenAI DALL-E 3 API specification for compatibility.
+// @Accept			json
+// @Param			request	body	openai.ImageGenerateParams	true	"Image generation request"
+// @Produce		json
+// @Success		200	{object}	openai.ImagesResponse	"Successful image generation response"
+// @Failure		400	{object}	map[string]any	"Bad request - invalid parameters"
+// @Failure		404	{object}	map[string]any	"Model not found"
+// @Failure		500	{object}	map[string]any	"Internal server error"
+func ImageGenerations(c *gin.Context) {
+	// Upper bound for "n", matching the range documented in the API schema.
+	const maxImagesPerRequest = 10
+
+	param := openai.ImageGenerateParams{}
+	if err := c.ShouldBindJSON(&param); err != nil {
+		slog.Error("Failed to bind JSON request", "error", err)
+		c.JSON(http.StatusBadRequest, map[string]any{"error": err.Error()})
+		return
+	}
+
+	slog.Info("Image generation request received",
+		"model", param.Model,
+		"prompt_length", len(param.Prompt),
+		"n", param.N,
+		"size", param.Size)
+
+	// Fill in defaults for optional fields the client left unset.
+	if param.N.Value == 0 {
+		param.N.Value = 1
+	}
+	if param.Size == "" {
+		param.Size = openai.ImageGenerateParamsSize512x512
+	}
+	if param.ResponseFormat == "" {
+		param.ResponseFormat = openai.ImageGenerateParamsResponseFormatURL
+	}
+
+	// Validate the request before touching the model: loading is expensive
+	// and a bad request should fail fast with a 400.
+	if param.N.Value < 1 || param.N.Value > maxImagesPerRequest {
+		c.JSON(http.StatusBadRequest, map[string]any{
+			"error": fmt.Sprintf("n must be between 1 and %d", maxImagesPerRequest),
+		})
+		return
+	}
+	width, height, err := parseImageSize(string(param.Size))
+	if err != nil {
+		c.JSON(http.StatusBadRequest, map[string]any{"error": err.Error()})
+		return
+	}
+
+	imageGen, err := service.KeepAliveGet[nexa_sdk.ImageGen](
+		param.Model,
+		types.ModelParam{},
+		c.GetHeader("Nexa-KeepCache") != "true",
+	)
+	if err != nil {
+		slog.Error("Failed to load image generation model", "model", param.Model, "error", err)
+		c.JSON(http.StatusInternalServerError, map[string]any{"error": err.Error()})
+		return
+	}
+
+	n := int(param.N.Value)
+	imageData := make([]openai.Image, 0, n)
+	slog.Info("Starting image generation", "count", n, "size", string(param.Size))
+	for i := range n {
+		outputPath := fmt.Sprintf("imagegen_output_%d.png", time.Now().UnixNano())
+		slog.Debug("Generating image", "index", i, "output_path", outputPath)
+
+		// The config is rebuilt per image so that every image gets its own
+		// time-derived seed.
+		config := &nexa_sdk.ImageGenerationConfig{
+			Prompts:         []string{param.Prompt},
+			NegativePrompts: []string{"blurry, low quality, distorted, low resolution"},
+			Height:          height,
+			Width:           width,
+			SamplerConfig: nexa_sdk.ImageSamplerConfig{
+				Method:        "ddim",
+				Steps:         20,
+				GuidanceScale: 7.5,
+				Eta:           0.0,
+				Seed:          int32(time.Now().UnixNano() % 1000000),
+			},
+			SchedulerConfig: nexa_sdk.SchedulerConfig{
+				Type:              "ddim",
+				NumTrainTimesteps: 1000,
+				StepsOffset:       1,
+				BetaStart:         0.00085,
+				BetaEnd:           0.012,
+				BetaSchedule:      "scaled_linear",
+				PredictionType:    "epsilon",
+				TimestepType:      "discrete",
+				TimestepSpacing:   "leading",
+				InterpolationType: "linear",
+				ConfigPath:        "",
+			},
+			Strength: 1.0,
+		}
+
+		result, err := imageGen.Txt2Img(nexa_sdk.ImageGenTxt2ImgInput{
+			PromptUTF8: param.Prompt,
+			Config:     config,
+			OutputPath: outputPath,
+		})
+		if err != nil {
+			slog.Error("Image generation failed", "index", i, "error", err)
+			c.JSON(http.StatusInternalServerError, map[string]any{"error": fmt.Sprintf("image generation failed: %v", err)})
+			return
+		}
+
+		data := openai.Image{
+			RevisedPrompt: param.Prompt,
+		}
+
+		if param.ResponseFormat == openai.ImageGenerateParamsResponseFormatB64JSON {
+			b64Data, err := encodeImageToBase64(result.OutputImagePath)
+			// The file is only a transport artifact in this mode; remove it
+			// whether or not encoding succeeded. Removal is best-effort.
+			if rmErr := os.Remove(result.OutputImagePath); rmErr != nil {
+				slog.Warn("Failed to remove generated image file", "path", result.OutputImagePath, "error", rmErr)
+			}
+			if err != nil {
+				c.JSON(http.StatusInternalServerError, map[string]any{"error": fmt.Sprintf("failed to encode image: %v", err)})
+				return
+			}
+			data.B64JSON = b64Data
+		} else {
+			// "url" mode hands back the path of the generated file as-is.
+			data.URL = result.OutputImagePath
+		}
+
+		imageData = append(imageData, data)
+		slog.Info("Image generated successfully", "index", i, "output_path", result.OutputImagePath)
+	}
+
+	response := openai.ImagesResponse{
+		Created: time.Now().Unix(),
+		Data:    imageData,
+	}
+
+	slog.Info("Image generation completed successfully", "total_images", len(imageData))
+	c.JSON(http.StatusOK, response)
+}
+
+// parseImageSize parses a "<width>x<height>" size string such as "512x512"
+// and returns the dimensions as (width, height).
+//
+// It returns an error when size does not consist of exactly two "x"-separated
+// fields, or when either field is not a positive decimal integer that fits in
+// an int32.
+func parseImageSize(size string) (int32, int32, error) {
+	parts := strings.Split(size, "x")
+	if len(parts) != 2 {
+		return 0, 0, errors.New("invalid size format")
+	}
+
+	// bitSize 32 makes ParseInt reject values that would otherwise be
+	// silently truncated by the int32 conversion below.
+	width, err := strconv.ParseInt(parts[0], 10, 32)
+	if err != nil || width <= 0 {
+		return 0, 0, errors.New("invalid width")
+	}
+
+	height, err := strconv.ParseInt(parts[1], 10, 32)
+	if err != nil || height <= 0 {
+		return 0, 0, errors.New("invalid height")
+	}
+
+	return int32(width), int32(height), nil
+}
+
+// encodeImageToBase64 reads the file at imagePath and returns its contents as
+// a data URI of the form "data:<mime-type>;base64,<payload>". The MIME type is
+// sniffed from the file contents with http.DetectContentType and the payload
+// uses standard base64 encoding.
+//
+// If the file cannot be read, the underlying error is wrapped so callers can
+// still inspect it with errors.Is / errors.As.
+func encodeImageToBase64(imagePath string) (string, error) {
+	imageData, err := os.ReadFile(imagePath)
+	if err != nil {
+		return "", fmt.Errorf("failed to read image file: %w", err)
+	}
+	mimeType := http.DetectContentType(imageData)
+	base64String := base64.StdEncoding.EncodeToString(imageData)
+	return fmt.Sprintf("data:%s;base64,%s", mimeType, base64String), nil
+}
diff --git a/runner/server/route.go b/runner/server/route.go
@@ -33,6 +33,8 @@ func RegisterAPIv1(r *gin.Engine) {
 
 	g.POST("/embeddings", handler.Embeddings)
 
+	g.POST("/images/generations", handler.ImageGenerations)
+
 	//g.POST("/reranking", handler.Reranking)
 
 	g.GET("/models/*model", handler.RetrieveModel)
diff --git a/runner/server/service/keepalive.go b/runner/server/service/keepalive.go
@@ -127,7 +127,7 @@ func keepAliveGet[T any](name string, param types.ModelParam, reset bool) (any,
 			break
 		}
 	}
-
+	
 	var t keepable
 	var e error
 	switch reflect.TypeFor[T]() {
@@ -168,6 +168,15 @@ func keepAliveGet[T any](name string, param types.ModelParam, reset bool) (any,
 			PluginID:  manifest.PluginId,
 			DeviceID:  manifest.DeviceId,
 		})
+	case reflect.TypeFor[nexa_sdk.ImageGen]():
+		// For image generation models, use the model directory path instead of specific file
+		modelDir := s.ModelfilePath(manifest.Name, "")
+		t, e = nexa_sdk.NewImageGen(nexa_sdk.ImageGenCreateInput{
+			ModelName: manifest.ModelName,
+			ModelPath: modelDir,
+			PluginID:  manifest.PluginId,
+			DeviceID:  manifest.DeviceId,
+		})
 	//case reflect.TypeFor[nexa_sdk.Reranker]():
 	//	t, e = nexa_sdk.NewReranker(modelfile, nil, param.Device)
 	//case reflect.TypeFor[nexa_sdk.TTS]():