-
Notifications
You must be signed in to change notification settings - Fork 521
Import feature-extraction
inference type from TEI
#781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
/* | ||
* Fetches TEI specs and generates JSON schema for input and output of | ||
* text-embeddings (called feature-extraction). | ||
* See https://huggingface.github.io/text-embeddings-inference/ | ||
*/ | ||
import { existsSync as pathExists } from "node:fs";
import fs from "fs/promises";
import * as path from "node:path/posix";
import { fileURLToPath } from "node:url";
import type { JsonObject, JsonValue } from "type-fest";
|
||
const URL = "https://huggingface.github.io/text-embeddings-inference/openapi.json"; | ||
|
||
const rootDirFinder = function (): string { | ||
let currentPath = path.normalize(import.meta.url); | ||
|
||
while (currentPath !== "/") { | ||
if (pathExists(path.join(currentPath, "package.json"))) { | ||
return currentPath; | ||
} | ||
|
||
currentPath = path.normalize(path.join(currentPath, "..")); | ||
} | ||
|
||
return "/"; | ||
}; | ||
|
||
const rootDir = rootDirFinder(); | ||
const tasksDir = path.join(rootDir, "src", "tasks"); | ||
|
||
function toCamelCase(str: string, joiner = "") { | ||
return str | ||
.split(/[-_]/) | ||
.map((part) => part.charAt(0).toUpperCase() + part.slice(1)) | ||
.join(joiner); | ||
} | ||
|
||
async function _extractAndAdapt(task: string, mainComponentName: string, type: "input" | "output" | "stream_output") { | ||
console.debug(`✨ Importing`, task, type); | ||
|
||
console.debug(" 📥 Fetching TEI specs"); | ||
const response = await fetch(URL); | ||
// eslint-disable-next-line @typescript-eslint/no-explicit-any | ||
const openapi = (await response.json()) as any; | ||
// eslint-disable-next-line @typescript-eslint/no-explicit-any | ||
const components: Record<string, any> = openapi["components"]["schemas"]; | ||
|
||
// e.g. TextGeneration | ||
const camelName = toCamelCase(task); | ||
// e.g. TextGenerationInput | ||
const camelFullName = camelName + toCamelCase(type); | ||
const mainComponent = components[mainComponentName]; | ||
const filteredComponents: Record<string, JsonObject> = {}; | ||
|
||
function _scan(data: JsonValue) { | ||
if (Array.isArray(data) || data instanceof Array) { | ||
for (const item of data) { | ||
_scan(item); | ||
} | ||
} else if (data && typeof data === "object") { | ||
for (const key of Object.keys(data)) { | ||
if (key === "$ref" && data[key] === "#/components/schemas/Input") { | ||
// Special case: keep input as string or string[] | ||
// but not Union[List[Union[List[int], int, str]], str] | ||
// data.delete(key); | ||
delete data[key]; | ||
data["type"] = "string"; | ||
data["description"] = "The text to embed."; | ||
} else if (key === "$ref" && typeof data[key] === "string") { | ||
// Verify reference exists | ||
const ref = (data[key] as string).split("/").pop() ?? ""; | ||
if (!components[ref]) { | ||
throw new Error(`Reference not found in components: ${data[key]}`); | ||
} | ||
|
||
// Add reference to components to export (and scan it too) | ||
const newRef = camelFullName + ref.replace(camelName, ""); | ||
if (!filteredComponents[newRef]) { | ||
components[ref]["title"] = newRef; // Rename title to avoid conflicts | ||
filteredComponents[newRef] = components[ref]; | ||
_scan(components[ref]); | ||
} | ||
|
||
// Updating the reference to new format | ||
data[key] = `#/$defs/${newRef}`; | ||
} else { | ||
_scan(data[key]); | ||
} | ||
} | ||
} | ||
} | ||
|
||
console.debug(" 📦 Packaging jsonschema"); | ||
_scan(mainComponent); | ||
|
||
const prettyName = toCamelCase(task, " ") + " " + toCamelCase(type, " "); | ||
const inputSchema = { | ||
$id: `/inference/schemas/${task}/${type}.json`, | ||
$schema: "http://json-schema.org/draft-06/schema#", | ||
description: | ||
prettyName + | ||
".\n\nAuto-generated from TEI specs." + | ||
"\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||
title: camelFullName, | ||
type: mainComponent["type"], | ||
required: mainComponent["required"], | ||
properties: mainComponent["properties"], | ||
$defs: filteredComponents, | ||
items: mainComponent["items"], | ||
}; | ||
|
||
const specPath = path.join(tasksDir, task, "spec", `${type}.json`); | ||
console.debug(" 📂 Exporting", specPath); | ||
await fs.writeFile(specPath, JSON.stringify(inputSchema, null, 4)); | ||
} | ||
|
||
await _extractAndAdapt("feature-extraction", "EmbedRequest", "input"); | ||
await _extractAndAdapt("feature-extraction", "EmbedResponse", "output"); | ||
console.debug("✅ All done!"); |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,26 +1,47 @@ | ||||||
{ | ||||||
"$id": "/inference/schemas/feature-extraction/input.json", | ||||||
"$schema": "http://json-schema.org/draft-06/schema#", | ||||||
"description": "Inputs for Text Embedding inference", | ||||||
"description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More generally, should we have a way to mark if an input is batchable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a broader discussion for other inference types I think. At first we took the decision to avoid batched inputs in specs for simplicity. We can revisit if we see more demand for it but that's not the case yet for what I've seen (or marginally). |
||||||
"title": "FeatureExtractionInput", | ||||||
"type": "object", | ||||||
"required": ["inputs"], | ||||||
"properties": { | ||||||
"inputs": { | ||||||
"description": "The text to get the embeddings of", | ||||||
"type": "string" | ||||||
"type": "string", | ||||||
"description": "The text to embed." | ||||||
}, | ||||||
"parameters": { | ||||||
"description": "Additional inference parameters", | ||||||
"$ref": "#/$defs/FeatureExtractionParameters" | ||||||
"normalize": { | ||||||
"type": "boolean", | ||||||
"default": "true", | ||||||
"example": "true" | ||||||
}, | ||||||
"prompt_name": { | ||||||
"type": "string", | ||||||
"description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be updated in TEI really
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened a PR: huggingface/text-embeddings-inference#342 |
||||||
"default": "null", | ||||||
"example": "null", | ||||||
"nullable": true | ||||||
}, | ||||||
"truncate": { | ||||||
"type": "boolean", | ||||||
"default": "false", | ||||||
"example": "false", | ||||||
Comment on lines
+27
to
+28
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This refers to the truncation of inputs, correct? As in, whether inputs get truncated to e.g. 512 tokens? If so, should this not be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes
Ping @OlivierDehaene here since it's more of a design choice in TEI |
||||||
"nullable": true | ||||||
}, | ||||||
"truncation_direction": { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that this is implemented in TEI, but I don't think I've ever seen left truncation on embedding models (which are normally bidirectional and non-causal). It's fine to be prepared for a "more causal" future, though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping @OlivierDehaene on that if you have more context |
||||||
"allOf": [ | ||||||
{ | ||||||
"$ref": "#/$defs/FeatureExtractionInputTruncationDirection" | ||||||
} | ||||||
], | ||||||
"default": "right" | ||||||
} | ||||||
}, | ||||||
"$defs": { | ||||||
"FeatureExtractionParameters": { | ||||||
"title": "FeatureExtractionParameters", | ||||||
"description": "Additional inference parameters for Feature Extraction", | ||||||
"type": "object", | ||||||
"properties": {} | ||||||
"FeatureExtractionInputTruncationDirection": { | ||||||
"type": "string", | ||||||
"enum": ["Left", "Right"], | ||||||
"title": "FeatureExtractionInputTruncationDirection" | ||||||
} | ||||||
}, | ||||||
"required": ["inputs"] | ||||||
} | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,15 @@ | ||
{ | ||
"$id": "/inference/schemas/feature-extraction/output.json", | ||
"$schema": "http://json-schema.org/draft-06/schema#", | ||
"description": "The embedding for the input text, as a nested list (tensor) of floats", | ||
"description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||
"title": "FeatureExtractionOutput", | ||
"type": "array", | ||
"title": "FeatureExtractionOutput" | ||
"$defs": {}, | ||
"items": { | ||
"type": "array", | ||
"items": { | ||
"type": "number", | ||
"format": "float" | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this is ok, although I'm a bit worried about the inconsistencies between TEI and inference API
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I would prefer not to change this for other tasks as well (we have exceptions for text-generation / chat-completion / feature-extraction now 😕 ). Next time we start building a new framework for a task let's think about specs straight away