-
Notifications
You must be signed in to change notification settings - Fork 521
Import feature-extraction
inference type from TEI
#781
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,118 @@ | ||
/* | ||
* Fetches TEI specs and generates JSON schema for input and output of | ||
* text-embeddings (called feature-extraction). | ||
* See https://huggingface.github.io/text-embeddings-inference/ | ||
*/ | ||
import { existsSync as pathExists } from "node:fs";
import fs from "fs/promises";
import * as path from "node:path/posix";
import { fileURLToPath } from "node:url";
import type { JsonObject, JsonValue } from "type-fest";
|
||
const URL = "https://huggingface.github.io/text-embeddings-inference/openapi.json"; | ||
|
||
const rootDirFinder = function (): string { | ||
let currentPath = path.normalize(import.meta.url); | ||
|
||
while (currentPath !== "/") { | ||
if (pathExists(path.join(currentPath, "package.json"))) { | ||
return currentPath; | ||
} | ||
|
||
currentPath = path.normalize(path.join(currentPath, "..")); | ||
} | ||
|
||
return "/"; | ||
}; | ||
|
||
const rootDir = rootDirFinder(); | ||
const tasksDir = path.join(rootDir, "src", "tasks"); | ||
|
||
function toCamelCase(str: string, joiner = "") { | ||
return str | ||
.split(/[-_]/) | ||
.map((part) => part.charAt(0).toUpperCase() + part.slice(1)) | ||
.join(joiner); | ||
} | ||
|
||
async function _extractAndAdapt(task: string, mainComponentName: string, type: "input" | "output" | "stream_output") { | ||
console.debug(`✨ Importing`, task, type); | ||
|
||
console.debug(" 📥 Fetching TEI specs"); | ||
const response = await fetch(URL); | ||
// eslint-disable-next-line @typescript-eslint/no-explicit-any | ||
const openapi = (await response.json()) as any; | ||
// eslint-disable-next-line @typescript-eslint/no-explicit-any | ||
const components: Record<string, any> = openapi["components"]["schemas"]; | ||
|
||
// e.g. TextGeneration | ||
const camelName = toCamelCase(task); | ||
// e.g. TextGenerationInput | ||
const camelFullName = camelName + toCamelCase(type); | ||
const mainComponent = components[mainComponentName]; | ||
const filteredComponents: Record<string, JsonObject> = {}; | ||
|
||
function _scan(data: JsonValue) { | ||
if (Array.isArray(data) || data instanceof Array) { | ||
for (const item of data) { | ||
_scan(item); | ||
} | ||
} else if (data && typeof data === "object") { | ||
for (const key of Object.keys(data)) { | ||
if (key === "$ref" && data[key] === "#/components/schemas/Input") { | ||
// Special case: keep input as string or string[] | ||
// but not Union[List[Union[List[int], int, str]], str] | ||
// data.delete(key); | ||
delete data[key]; | ||
data["type"] = "string"; | ||
data["description"] = "The text to embed."; | ||
} else if (key === "$ref" && typeof data[key] === "string") { | ||
// Verify reference exists | ||
const ref = (data[key] as string).split("/").pop() ?? ""; | ||
if (!components[ref]) { | ||
throw new Error(`Reference not found in components: ${data[key]}`); | ||
} | ||
|
||
// Add reference to components to export (and scan it too) | ||
const newRef = camelFullName + ref.replace(camelName, ""); | ||
if (!filteredComponents[newRef]) { | ||
components[ref]["title"] = newRef; // Rename title to avoid conflicts | ||
filteredComponents[newRef] = components[ref]; | ||
_scan(components[ref]); | ||
} | ||
|
||
// Updating the reference to new format | ||
data[key] = `#/$defs/${newRef}`; | ||
} else { | ||
_scan(data[key]); | ||
} | ||
} | ||
} | ||
} | ||
|
||
console.debug(" 📦 Packaging jsonschema"); | ||
_scan(mainComponent); | ||
|
||
const prettyName = toCamelCase(task, " ") + " " + toCamelCase(type, " "); | ||
const inputSchema = { | ||
$id: `/inference/schemas/${task}/${type}.json`, | ||
$schema: "http://json-schema.org/draft-06/schema#", | ||
description: | ||
prettyName + | ||
".\n\nAuto-generated from TEI specs." + | ||
"\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||
title: camelFullName, | ||
type: mainComponent["type"], | ||
required: mainComponent["required"], | ||
properties: mainComponent["properties"], | ||
$defs: filteredComponents, | ||
items: mainComponent["items"], | ||
}; | ||
|
||
const specPath = path.join(tasksDir, task, "spec", `${type}.json`); | ||
console.debug(" 📂 Exporting", specPath); | ||
await fs.writeFile(specPath, JSON.stringify(inputSchema, null, 4)); | ||
} | ||
|
||
await _extractAndAdapt("feature-extraction", "EmbedRequest", "input"); | ||
await _extractAndAdapt("feature-extraction", "EmbedResponse", "output"); | ||
console.debug("✅ All done!"); |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,26 +1,47 @@ | ||||||
{ | ||||||
"$id": "/inference/schemas/feature-extraction/input.json", | ||||||
"$schema": "http://json-schema.org/draft-06/schema#", | ||||||
"description": "Inputs for Text Embedding inference", | ||||||
"description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. More generally, should we have a way to mark if an input is batchable? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's a broader discussion for other inference types I think. At first we took the decision to avoid batched inputs in specs for simplicity. We can revisit if we see more demand for it but that's not the case yet for what I've seen (or marginally). |
||||||
"title": "FeatureExtractionInput", | ||||||
"type": "object", | ||||||
"required": ["inputs"], | ||||||
"properties": { | ||||||
"inputs": { | ||||||
"description": "The text to get the embeddings of", | ||||||
"type": "string" | ||||||
"type": "string", | ||||||
"description": "The text to embed." | ||||||
}, | ||||||
"parameters": { | ||||||
"description": "Additional inference parameters", | ||||||
"$ref": "#/$defs/FeatureExtractionParameters" | ||||||
"normalize": { | ||||||
"type": "boolean", | ||||||
"default": "true", | ||||||
"example": "true" | ||||||
}, | ||||||
"prompt_name": { | ||||||
"type": "string", | ||||||
"description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.", | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. To be updated in TEI really
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Opened a PR: huggingface/text-embeddings-inference#342 |
||||||
"default": "null", | ||||||
"example": "null", | ||||||
"nullable": true | ||||||
}, | ||||||
"truncate": { | ||||||
"type": "boolean", | ||||||
"default": "false", | ||||||
"example": "false", | ||||||
Comment on lines
+27
to
+28
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This refers to the truncation of inputs, correct? As in, whether inputs get truncated to e.g. 512 tokens? If so, should this not be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Yes
Ping @OlivierDehaene here since it's more of a design choice in TEI |
||||||
"nullable": true | ||||||
}, | ||||||
"truncation_direction": { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I understand that this is implemented in TEI, but I don't think I've ever seen left truncation on embedding models (which are normally bidirectional and non-causal). It's fine to be prepared for a "more causal" future, though. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping @OlivierDehaene on that if you have more context |
||||||
"allOf": [ | ||||||
{ | ||||||
"$ref": "#/$defs/FeatureExtractionInputTruncationDirection" | ||||||
} | ||||||
], | ||||||
"default": "right" | ||||||
} | ||||||
}, | ||||||
"$defs": { | ||||||
"FeatureExtractionParameters": { | ||||||
"title": "FeatureExtractionParameters", | ||||||
"description": "Additional inference parameters for Feature Extraction", | ||||||
"type": "object", | ||||||
"properties": {} | ||||||
"FeatureExtractionInputTruncationDirection": { | ||||||
"type": "string", | ||||||
"enum": ["Left", "Right"], | ||||||
"title": "FeatureExtractionInputTruncationDirection" | ||||||
} | ||||||
}, | ||||||
"required": ["inputs"] | ||||||
} | ||||||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,15 @@ | ||
{ | ||
"$id": "/inference/schemas/feature-extraction/output.json", | ||
"$schema": "http://json-schema.org/draft-06/schema#", | ||
"description": "The embedding for the input text, as a nested list (tensor) of floats", | ||
"description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.", | ||
"title": "FeatureExtractionOutput", | ||
"type": "array", | ||
"title": "FeatureExtractionOutput" | ||
"$defs": {}, | ||
"items": { | ||
"type": "array", | ||
"items": { | ||
"type": "number", | ||
"format": "float" | ||
} | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I guess this is ok, although I'm a bit worried about the inconsistencies between TEI and inference API
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I would prefer not to change this for other tasks as well (we have exceptions for text-generation / chat-completion / feature-extraction now 😕 ). Next time we start building a new framework for a task let's think about specs straight away