Merged
3 changes: 2 additions & 1 deletion packages/tasks/package.json
@@ -28,7 +28,8 @@
"prepare": "pnpm run build",
"check": "tsc",
"inference-codegen": "tsx scripts/inference-codegen.ts && prettier --write src/tasks/*/inference.ts",
"inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json"
"inference-tgi-import": "tsx scripts/inference-tgi-import.ts && prettier --write src/tasks/text-generation/spec/*.json && prettier --write src/tasks/chat-completion/spec/*.json",
"inference-tei-import": "tsx scripts/inference-tei-import.ts && prettier --write src/tasks/feature-extraction/spec/*.json"
},
"type": "module",
"files": [
118 changes: 118 additions & 0 deletions packages/tasks/scripts/inference-tei-import.ts
@@ -0,0 +1,118 @@
/*
 * Fetches TEI specs and generates JSON schema for input and output of
 * text-embeddings (called feature-extraction).
 * See https://huggingface.github.io/text-embeddings-inference/
 */
import fs from "fs/promises";
import * as path from "node:path/posix";
import { existsSync as pathExists } from "node:fs";
import type { JsonObject, JsonValue } from "type-fest";

const URL = "https://huggingface.github.io/text-embeddings-inference/openapi.json";

const rootDirFinder = function (): string {
    let currentPath = path.normalize(import.meta.url);

    while (currentPath !== "/") {
        if (pathExists(path.join(currentPath, "package.json"))) {
            return currentPath;
        }

        currentPath = path.normalize(path.join(currentPath, ".."));
    }

    return "/";
};

const rootDir = rootDirFinder();
const tasksDir = path.join(rootDir, "src", "tasks");

function toCamelCase(str: string, joiner = "") {
    return str
        .split(/[-_]/)
        .map((part) => part.charAt(0).toUpperCase() + part.slice(1))
        .join(joiner);
}

async function _extractAndAdapt(task: string, mainComponentName: string, type: "input" | "output" | "stream_output") {
    console.debug(`✨ Importing`, task, type);

    console.debug(" 📥 Fetching TEI specs");
    const response = await fetch(URL);
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const openapi = (await response.json()) as any;
    // eslint-disable-next-line @typescript-eslint/no-explicit-any
    const components: Record<string, any> = openapi["components"]["schemas"];

    // e.g. FeatureExtraction
    const camelName = toCamelCase(task);
    // e.g. FeatureExtractionInput
    const camelFullName = camelName + toCamelCase(type);
    const mainComponent = components[mainComponentName];
    const filteredComponents: Record<string, JsonObject> = {};

    function _scan(data: JsonValue) {
        if (Array.isArray(data) || data instanceof Array) {
            for (const item of data) {
                _scan(item);
            }
        } else if (data && typeof data === "object") {
            for (const key of Object.keys(data)) {
                if (key === "$ref" && data[key] === "#/components/schemas/Input") {
                    // Special case: keep input as string or string[]
                    // but not Union[List[Union[List[int], int, str]], str]
                    delete data[key];
                    data["type"] = "string";
                    data["description"] = "The text to embed.";
                } else if (key === "$ref" && typeof data[key] === "string") {
                    // Verify reference exists
                    const ref = (data[key] as string).split("/").pop() ?? "";
                    if (!components[ref]) {
                        throw new Error(`Reference not found in components: ${data[key]}`);
                    }

                    // Add reference to components to export (and scan it too)
                    const newRef = camelFullName + ref.replace(camelName, "");
                    if (!filteredComponents[newRef]) {
                        components[ref]["title"] = newRef; // Rename title to avoid conflicts
                        filteredComponents[newRef] = components[ref];
                        _scan(components[ref]);
                    }

                    // Update the reference to the new format
                    data[key] = `#/$defs/${newRef}`;
                } else {
                    _scan(data[key]);
                }
            }
        }
    }

    console.debug(" 📦 Packaging jsonschema");
    _scan(mainComponent);

    const prettyName = toCamelCase(task, " ") + " " + toCamelCase(type, " ");
    const inputSchema = {
        $id: `/inference/schemas/${task}/${type}.json`,
        $schema: "http://json-schema.org/draft-06/schema#",
        description:
            prettyName +
            ".\n\nAuto-generated from TEI specs." +
            "\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
        title: camelFullName,
        type: mainComponent["type"],
        required: mainComponent["required"],
        properties: mainComponent["properties"],
        $defs: filteredComponents,
        items: mainComponent["items"],
    };

    const specPath = path.join(tasksDir, task, "spec", `${type}.json`);
    console.debug(" 📂 Exporting", specPath);
    await fs.writeFile(specPath, JSON.stringify(inputSchema, null, 4));
}

await _extractAndAdapt("feature-extraction", "EmbedRequest", "input");
await _extractAndAdapt("feature-extraction", "EmbedResponse", "output");
console.debug("✅ All done!");
2 changes: 1 addition & 1 deletion packages/tasks/scripts/inference-tgi-import.ts
@@ -1,5 +1,5 @@
/*
 * Fetches TGI specs and generated JSON schema for input, output and stream_output of
 * Fetches TGI specs and generates JSON schema for input, output and stream_output of
 * text-generation and chat-completion tasks.
 * See https://huggingface.github.io/text-generation-inference/
 */
28 changes: 23 additions & 5 deletions packages/tasks/src/tasks/feature-extraction/inference.ts
@@ -4,19 +4,37 @@
 * Using src/scripts/inference-codegen
 */

export type FeatureExtractionOutput = unknown[];
export type FeatureExtractionOutput = Array<number[]>;

/**
 * Inputs for Text Embedding inference
 * Feature Extraction Input.
 *
 * Auto-generated from TEI specs.
 * For more details, check out
 * https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.
 */
export interface FeatureExtractionInput {
    /**
     * The text to get the embeddings of
     * The text to embed.
     */
    inputs: string;
    normalize?: boolean;
    /**
     * Additional inference parameters
     * The name of the prompt that should be used by for encoding. If not set, no prompt
     * will be applied.
     *
     * Must be a key in the `Sentence Transformers` configuration `prompts` dictionary.
     *
     * For example if ``prompt_name`` is "query" and the ``prompts`` is {"query": "query: ", ...},
     * then the sentence "What is the capital of France?" will be encoded as
     * "query: What is the capital of France?" because the prompt text will be prepended before
     * any text to encode.
     */
    parameters?: { [key: string]: unknown };
    prompt_name?: string;
[Review thread on prompt_name]
Contributor: I guess this is ok, although I'm a bit worried about the inconsistencies between TEI and the Inference API.
Contributor (author): Yeah, I would prefer not to change this for other tasks as well (we have exceptions for text-generation / chat-completion / feature-extraction now 😕). Next time we start building a new framework for a task, let's think about specs straight away.
    truncate?: boolean;
    truncation_direction?: FeatureExtractionInputTruncationDirection;
    [property: string]: unknown;
}

export type FeatureExtractionInputTruncationDirection = "Left" | "Right";
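
As a usage sketch (my own example, not from the PR): posting a FeatureExtractionInput to a TEI-compatible embedding route. The endpoint URL is a placeholder, error handling is elided, and I'm assuming the generated types are re-exported from @huggingface/tasks:

import type { FeatureExtractionInput, FeatureExtractionOutput } from "@huggingface/tasks";

// Placeholder endpoint: any server exposing a TEI-compatible /embed route.
const ENDPOINT = "https://example.com/embed";

async function embed(text: string): Promise<FeatureExtractionOutput> {
    const payload: FeatureExtractionInput = {
        inputs: text,
        normalize: true, // spec default
        truncate: false, // spec default
        truncation_direction: "Right",
    };
    const res = await fetch(ENDPOINT, {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify(payload),
    });
    return (await res.json()) as FeatureExtractionOutput;
}

// The output spec is a list of float vectors, so a single input yields one row.
const [vector] = await embed("What is the capital of France?");
console.log(vector.length);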
47 changes: 34 additions & 13 deletions packages/tasks/src/tasks/feature-extraction/spec/input.json
@@ -1,26 +1,47 @@
{
    "$id": "/inference/schemas/feature-extraction/input.json",
    "$schema": "http://json-schema.org/draft-06/schema#",
    "description": "Inputs for Text Embedding inference",
    "description": "Feature Extraction Input.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
[Review thread on the description field]
Contributor: More generally, should we have a way to mark if an input is batchable?
Contributor (author): That's a broader discussion for other inference types, I think. At first we took the decision to avoid batched inputs in specs for simplicity. We can revisit if we see more demand for it, but that's not the case yet from what I've seen (or only marginally).
"title": "FeatureExtractionInput",
"type": "object",
"required": ["inputs"],
"properties": {
"inputs": {
"description": "The text to get the embeddings of",
"type": "string"
"type": "string",
"description": "The text to embed."
},
"parameters": {
"description": "Additional inference parameters",
"$ref": "#/$defs/FeatureExtractionParameters"
"normalize": {
"type": "boolean",
"default": "true",
"example": "true"
},
"prompt_name": {
"type": "string",
"description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
[Review thread on the prompt_name description]
Contributor: To be updated in TEI really.
Suggested change:
- "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `Sentence Transformers` configuration `prompts` dictionary.\n\nFor example if ``prompt_name`` is \"query\" and the ``prompts`` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
+ "description": "The name of the prompt that should be used by for encoding. If not set, no prompt\nwill be applied.\n\nMust be a key in the `sentence-transformers` configuration `prompts` dictionary.\n\nFor example if `prompt_name` is \"query\" and the `prompts` is {\"query\": \"query: \", ...},\nthen the sentence \"What is the capital of France?\" will be encoded as\n\"query: What is the capital of France?\" because the prompt text will be prepended before\nany text to encode.",
"default": "null",
"example": "null",
"nullable": true
},
"truncate": {
"type": "boolean",
"default": "false",
"example": "false",
[Review thread on lines +27 to +28]
Member: This refers to the truncation of inputs, correct? As in, whether inputs get truncated to e.g. 512 tokens? If so, should this not be "true" by default? In Sentence Transformer models, you can often get reduced performance when you exceed the recommended maximum sequence length.
Contributor (author):
> This refers to the truncation of inputs, correct? As in, whether inputs get truncated to e.g. 512 tokens?
Yes.
> If so, should this not be "true" by default? In Sentence Transformer models, you can often get reduced performance when you exceed the recommended maximum sequence length.
Ping @OlivierDehaene here since it's more of a design choice in TEI.
"nullable": true
},
"truncation_direction": {
[Review thread on truncation_direction]
Member: I understand that this is implemented in TEI, but I don't think I've ever seen left truncation on embedding models (which are normally bidirectional and non-causal). It's fine to be prepared for a "more causal" future, though.
Contributor (author): Ping @OlivierDehaene on that if you have more context. (I'll keep it here anyway since it's defined by TEI.)
"allOf": [
{
"$ref": "#/$defs/FeatureExtractionInputTruncationDirection"
}
],
"default": "right"
}
},
"$defs": {
"FeatureExtractionParameters": {
"title": "FeatureExtractionParameters",
"description": "Additional inference parameters for Feature Extraction",
"type": "object",
"properties": {}
"FeatureExtractionInputTruncationDirection": {
"type": "string",
"enum": ["Left", "Right"],
"title": "FeatureExtractionInputTruncationDirection"
}
},
"required": ["inputs"]
}
}
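
One possible way to consume the generated spec (a sketch under stated assumptions: Ajv v8 with its bundled draft-06 meta-schema, a TS setup with resolveJsonModule enabled, and an invented file path; not part of the PR):

import Ajv from "ajv";
// Ajv v8 only bundles draft-07+ by default; draft-06 must be registered explicitly.
import draft6MetaSchema from "ajv/dist/refs/json-schema-draft-06.json";
import inputSchema from "./src/tasks/feature-extraction/spec/input.json";

const ajv = new Ajv({ strict: false }); // the spec uses non-standard keywords like "example"
ajv.addMetaSchema(draft6MetaSchema);

const validate = ajv.compile(inputSchema);
console.log(validate({ inputs: "hello", truncation_direction: "Left" })); // true
console.log(validate({ truncate: true })); // false: "inputs" is required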
12 changes: 10 additions & 2 deletions packages/tasks/src/tasks/feature-extraction/spec/output.json
@@ -1,7 +1,15 @@
{
    "$id": "/inference/schemas/feature-extraction/output.json",
    "$schema": "http://json-schema.org/draft-06/schema#",
    "description": "The embedding for the input text, as a nested list (tensor) of floats",
    "description": "Feature Extraction Output.\n\nAuto-generated from TEI specs.\nFor more details, check out https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-tei-import.ts.",
    "title": "FeatureExtractionOutput",
    "type": "array",
    "title": "FeatureExtractionOutput"
    "$defs": {},
    "items": {
        "type": "array",
        "items": {
            "type": "number",
            "format": "float"
        }
    }
}
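
For reference, a value matching this output schema and the regenerated TypeScript type (an invented example, not from the PR):

import type { FeatureExtractionOutput } from "@huggingface/tasks";

// One float vector per embedded input; a single input yields a single row.
const example: FeatureExtractionOutput = [[0.0123, -0.0456, 0.0789]];
console.log(example[0].length); // embedding dimension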