Skip to content

Commit 41bd944

Browse files
hntrl, anadi45, christian-bromann
authored
feat(openai): encodingFormat support for embeddings (#8916)
Co-authored-by: anadi45 <[email protected]>
Co-authored-by: christian-bromann <[email protected]>
1 parent c10ea3e commit 41bd944

File tree

4 files changed

+71
-15
lines changed

4 files changed

+71
-15
lines changed

.changeset/fast-plants-wink.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
---
2+
"@langchain/openai": patch
3+
"@langchain/core": patch
4+
---
5+
6+
support base64 embeddings format

langchain-core/src/embeddings.ts

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -6,30 +6,32 @@ import { AsyncCaller, AsyncCallerParams } from "./utils/async_caller.js";
66
*/
77
export type EmbeddingsParams = AsyncCallerParams;
88

9-
export interface EmbeddingsInterface {
9+
export interface EmbeddingsInterface<TOutput = number[]> {
1010
/**
1111
* An abstract method that takes an array of documents as input and
1212
* returns a promise that resolves to an array of vectors for each
1313
* document.
1414
* @param documents An array of documents to be embedded.
1515
* @returns A promise that resolves to an array of vectors for each document.
1616
*/
17-
embedDocuments(documents: string[]): Promise<number[][]>;
17+
embedDocuments(documents: string[]): Promise<TOutput[]>;
1818

1919
/**
2020
* An abstract method that takes a single document as input and returns a
2121
* promise that resolves to a vector for the query document.
2222
* @param document A single document to be embedded.
2323
* @returns A promise that resolves to a vector for the query document.
2424
*/
25-
embedQuery(document: string): Promise<number[]>;
25+
embedQuery(document: string): Promise<TOutput>;
2626
}
2727

2828
/**
2929
* An abstract class that provides methods for embedding documents and
3030
* queries using LangChain.
3131
*/
32-
export abstract class Embeddings implements EmbeddingsInterface {
32+
export abstract class Embeddings<TOutput = number[]>
33+
implements EmbeddingsInterface<TOutput>
34+
{
3335
/**
3436
* The async caller should be used by subclasses to make any async calls,
3537
* which will thus benefit from the concurrency and retry logic.
@@ -47,13 +49,13 @@ export abstract class Embeddings implements EmbeddingsInterface {
4749
* @param documents An array of documents to be embedded.
4850
* @returns A promise that resolves to an array of vectors for each document.
4951
*/
50-
abstract embedDocuments(documents: string[]): Promise<number[][]>;
52+
abstract embedDocuments(documents: string[]): Promise<TOutput[]>;
5153

5254
/**
5355
* An abstract method that takes a single document as input and returns a
5456
* promise that resolves to a vector for the query document.
5557
* @param document A single document to be embedded.
5658
* @returns A promise that resolves to a vector for the query document.
5759
*/
58-
abstract embedQuery(document: string): Promise<number[]>;
60+
abstract embedQuery(document: string): Promise<TOutput>;
5961
}

libs/langchain-openai/src/embeddings.ts

Lines changed: 23 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ import { type ClientOptions, OpenAI as OpenAIClient } from "openai";
22
import { getEnvironmentVariable } from "@langchain/core/utils/env";
33
import { Embeddings, type EmbeddingsParams } from "@langchain/core/embeddings";
44
import { chunkArray } from "@langchain/core/utils/chunk_array";
5-
import { OpenAICoreRequestOptions } from "./types.js";
65
import { getEndpoint, OpenAIEndpointConfig } from "./utils/azure.js";
76
import { wrapOpenAIClientError } from "./utils/openai.js";
87

@@ -50,6 +49,11 @@ export interface OpenAIEmbeddingsParams extends EmbeddingsParams {
5049
* See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
5150
*/
5251
stripNewLines?: boolean;
52+
53+
/**
54+
* The format to return the embeddings in. Can be either 'float' or 'base64'.
55+
*/
56+
encodingFormat?: "float" | "base64";
5357
}
5458

5559
/**
@@ -68,8 +72,8 @@ export interface OpenAIEmbeddingsParams extends EmbeddingsParams {
6872
*
6973
* ```
7074
*/
71-
export class OpenAIEmbeddings
72-
extends Embeddings
75+
export class OpenAIEmbeddings<TOutput = number[]>
76+
extends Embeddings<TOutput>
7377
implements Partial<OpenAIEmbeddingsParams>
7478
{
7579
model = "text-embedding-ada-002";
@@ -92,6 +96,8 @@ export class OpenAIEmbeddings
9296

9397
organization?: string;
9498

99+
encodingFormat?: "float" | "base64";
100+
95101
protected client: OpenAIClient;
96102

97103
protected clientConfig: ClientOptions;
@@ -130,6 +136,7 @@ export class OpenAIEmbeddings
130136
fieldsWithDefaults?.stripNewLines ?? this.stripNewLines;
131137
this.timeout = fieldsWithDefaults?.timeout;
132138
this.dimensions = fieldsWithDefaults?.dimensions;
139+
this.encodingFormat = fieldsWithDefaults?.encodingFormat;
133140

134141
this.clientConfig = {
135142
apiKey,
@@ -146,7 +153,7 @@ export class OpenAIEmbeddings
146153
* @param texts Array of documents to generate embeddings for.
147154
* @returns Promise that resolves to a 2D array of embeddings for each document.
148155
*/
149-
async embedDocuments(texts: string[]): Promise<number[][]> {
156+
async embedDocuments(texts: string[]): Promise<TOutput[]> {
150157
const batches = chunkArray(
151158
this.stripNewLines ? texts.map((t) => t.replace(/\n/g, " ")) : texts,
152159
this.batchSize
@@ -160,16 +167,19 @@ export class OpenAIEmbeddings
160167
if (this.dimensions) {
161168
params.dimensions = this.dimensions;
162169
}
170+
if (this.encodingFormat) {
171+
params.encoding_format = this.encodingFormat;
172+
}
163173
return this.embeddingWithRetry(params);
164174
});
165175
const batchResponses = await Promise.all(batchRequests);
166176

167-
const embeddings: number[][] = [];
177+
const embeddings: TOutput[] = [];
168178
for (let i = 0; i < batchResponses.length; i += 1) {
169179
const batch = batches[i];
170180
const { data: batchResponse } = batchResponses[i];
171181
for (let j = 0; j < batch.length; j += 1) {
172-
embeddings.push(batchResponse[j].embedding);
182+
embeddings.push(batchResponse[j].embedding as TOutput);
173183
}
174184
}
175185
return embeddings;
@@ -181,16 +191,19 @@ export class OpenAIEmbeddings
181191
* @param text Document to generate an embedding for.
182192
* @returns Promise that resolves to an embedding for the document.
183193
*/
184-
async embedQuery(text: string): Promise<number[]> {
194+
async embedQuery(text: string): Promise<TOutput> {
185195
const params: OpenAIClient.EmbeddingCreateParams = {
186196
model: this.model,
187197
input: this.stripNewLines ? text.replace(/\n/g, " ") : text,
188198
};
189199
if (this.dimensions) {
190200
params.dimensions = this.dimensions;
191201
}
202+
if (this.encodingFormat) {
203+
params.encoding_format = this.encodingFormat;
204+
}
192205
const { data } = await this.embeddingWithRetry(params);
193-
return data[0].embedding;
206+
return data[0].embedding as TOutput;
194207
}
195208

196209
/**
@@ -223,7 +236,8 @@ export class OpenAIEmbeddings
223236

224237
this.client = new OpenAIClient(params);
225238
}
226-
const requestOptions: OpenAICoreRequestOptions = {};
239+
const requestOptions = {};
240+
227241
return this.caller.call(async () => {
228242
try {
229243
const res = await this.client.embeddings.create(

libs/langchain-openai/src/tests/embeddings.int.test.ts

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,3 +91,37 @@ test("Test OpenAIEmbeddings.embedDocuments with v3 and dimensions", async () =>
9191
expect(res[0].length).toBe(127);
9292
expect(res[1].length).toBe(127);
9393
});
94+
95+
test("Test OpenAIEmbeddings.embedQuery with encodingFormat", async () => {
96+
const embeddings = new OpenAIEmbeddings({
97+
modelName: "text-embedding-3-small",
98+
encodingFormat: "float",
99+
});
100+
const res = await embeddings.embedQuery("Hello world");
101+
expect(typeof res[0]).toBe("number");
102+
expect(res.length).toBe(1536); // Default dimension for text-embedding-3-small
103+
});
104+
105+
test("Test OpenAIEmbeddings.embedDocuments with encodingFormat", async () => {
106+
const embeddings = new OpenAIEmbeddings({
107+
modelName: "text-embedding-3-small",
108+
encodingFormat: "float",
109+
});
110+
const res = await embeddings.embedDocuments(["Hello world", "Bye bye"]);
111+
expect(res).toHaveLength(2);
112+
expect(typeof res[0][0]).toBe("number");
113+
expect(typeof res[1][0]).toBe("number");
114+
expect(res[0].length).toBe(1536); // Default dimension for text-embedding-3-small
115+
expect(res[1].length).toBe(1536);
116+
});
117+
118+
test("Test OpenAIEmbeddings with encodingFormat and custom dimensions", async () => {
119+
const embeddings = new OpenAIEmbeddings({
120+
modelName: "text-embedding-3-small",
121+
encodingFormat: "float",
122+
dimensions: 256,
123+
});
124+
const res = await embeddings.embedQuery("Hello world");
125+
expect(typeof res[0]).toBe("number");
126+
expect(res.length).toBe(256); // Should respect custom dimensions
127+
});

0 commit comments

Comments
 (0)