Skip to content

Commit b62c7ca

Browse files
berry-13 authored and danny-avila committed
🗣️ feat: add support for gpt-4o-transcribe models (#6483)
1 parent 7cb9f85 commit b62c7ca

File tree

2 files changed

+124
-5
lines changed

2 files changed

+124
-5
lines changed

api/server/services/Files/Audio/STTService.js

Lines changed: 75 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,78 @@ const { getCustomConfig } = require('~/server/services/Config');
77
const { genAzureEndpoint } = require('~/utils');
88
const { logger } = require('~/config');
99

10+
/**
 * Maps audio MIME types to their corresponding file extensions.
 * Keys are lowercase and carry no parameters (no `;codecs=...` suffix).
 * @type {Object<string, string>}
 */
const MIME_TO_EXTENSION_MAP = {
  // MP4 container formats
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
  // Ogg formats
  'audio/ogg': 'ogg',
  'audio/vorbis': 'ogg',
  'application/ogg': 'ogg',
  // Wave formats
  'audio/wav': 'wav',
  'audio/x-wav': 'wav',
  'audio/wave': 'wav',
  // MP3 formats
  'audio/mp3': 'mp3',
  'audio/mpeg': 'mp3',
  'audio/mpeg3': 'mp3',
  // WebM formats
  'audio/webm': 'webm',
  // Additional formats
  'audio/flac': 'flac',
  'audio/x-flac': 'flac',
};

/**
 * Extension returned when the MIME type is missing or unrecognized.
 * @type {string}
 */
const DEFAULT_AUDIO_EXTENSION = 'webm';

/**
 * Ordered substring fallbacks applied to the MIME subtype when the exact
 * lookup misses. The first entry with a matching marker wins, so the order
 * is significant; FLAC is last so that every previously matched subtype
 * keeps resolving to the same extension.
 * @type {Array<{ markers: string[], extension: string }>}
 */
const MIME_SUBTYPE_FALLBACKS = [
  { markers: ['mp4', 'm4a'], extension: 'm4a' },
  { markers: ['ogg'], extension: 'ogg' },
  { markers: ['wav'], extension: 'wav' },
  { markers: ['mp3', 'mpeg'], extension: 'mp3' },
  { markers: ['webm'], extension: 'webm' },
  { markers: ['flac'], extension: 'flac' },
];

/**
 * Gets the file extension from the MIME type.
 *
 * The input is reduced to its "type/subtype" part (parameters such as
 * `;codecs=opus` are dropped) and lowercased before any matching, so codec
 * names and letter case never influence the chosen container extension.
 *
 * @param {string} mimeType - The MIME type, optionally with parameters.
 * @returns {string} The file extension; 'webm' when the type is missing,
 * not a string, or unrecognized.
 */
function getFileExtensionFromMime(mimeType) {
  // Missing or non-string input: fall back instead of throwing on `.split`.
  if (typeof mimeType !== 'string') {
    return DEFAULT_AUDIO_EXTENSION;
  }

  // Keep only "type/subtype"; parameters describe the codec, not the container.
  const essence = mimeType.split(';')[0].trim().toLowerCase();

  // Own-property check so inherited keys like 'constructor' never match.
  if (Object.prototype.hasOwnProperty.call(MIME_TO_EXTENSION_MAP, essence)) {
    return MIME_TO_EXTENSION_MAP[essence];
  }

  // Fall back to substring matching on the subtype (e.g. 'x-mpeg-custom').
  const subtype = essence.split('/')[1] ?? '';
  const fallback = MIME_SUBTYPE_FALLBACKS.find(({ markers }) =>
    markers.some((marker) => subtype.includes(marker)),
  );

  return fallback ? fallback.extension : DEFAULT_AUDIO_EXTENSION;
}
81+
1082
/**
1183
* Service class for handling Speech-to-Text (STT) operations.
1284
* @class
@@ -170,8 +242,10 @@ class STTService {
170242
throw new Error('Invalid provider');
171243
}
172244

245+
const fileExtension = getFileExtensionFromMime(audioFile.mimetype);
246+
173247
const audioReadStream = Readable.from(audioBuffer);
174-
audioReadStream.path = 'audio.wav';
248+
audioReadStream.path = `audio.${fileExtension}`;
175249

176250
const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile);
177251

client/src/hooks/Input/useSpeechToTextExternal.ts

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ const useSpeechToTextExternal = (
2121
const [isListening, setIsListening] = useState(false);
2222
const [audioChunks, setAudioChunks] = useState<Blob[]>([]);
2323
const [isRequestBeingMade, setIsRequestBeingMade] = useState(false);
24+
const [audioMimeType, setAudioMimeType] = useState<string>('audio/webm');
2425

2526
const [minDecibels] = useRecoilState(store.decibelValue);
2627
const [autoSendText] = useRecoilState(store.autoSendText);
@@ -48,6 +49,44 @@ const useSpeechToTextExternal = (
4849
},
4950
});
5051

52+
/**
 * Picks the MIME type to record with.
 *
 * Candidates are probed in order through `MediaRecorder.isTypeSupported` and
 * the first supported one is returned. If none is reported as supported, the
 * choice falls back to a guess based on the user-agent string.
 */
const getBestSupportedMimeType = () => {
  const candidates = [
    'audio/webm',
    'audio/webm;codecs=opus',
    'audio/mp4',
    'audio/ogg;codecs=opus',
    'audio/ogg',
    'audio/wav',
  ];

  const supported = candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
  if (supported !== undefined) {
    return supported;
  }

  // Nothing was reported as supported: guess from the browser family.
  const agent = navigator.userAgent.toLowerCase();
  // Chromium-based user agents also contain "safari", so exclude "chrome".
  const looksLikeSafari = agent.includes('safari') && !agent.includes('chrome');
  if (looksLikeSafari) {
    return 'audio/mp4';
  }

  return agent.includes('firefox') ? 'audio/ogg' : 'audio/webm';
};
77+
78+
/**
 * Derives the upload file extension from a recording MIME type.
 *
 * Only the "type/subtype" part is inspected: parameters such as
 * `;codecs=opus` are dropped and the rest is lowercased, so a codec name
 * can never be mistaken for the container format.
 *
 * @param mimeType - MIME type of the recorded audio, optionally with parameters.
 * @returns The matching extension without a leading dot; `'webm'` when the
 * type is not recognized.
 */
const getFileExtension = (mimeType: string) => {
  const essence = (mimeType.split(';')[0] ?? '').trim().toLowerCase();

  if (essence.includes('mp4') || essence.includes('m4a')) {
    return 'm4a';
  }
  if (essence.includes('ogg')) {
    return 'ogg';
  }
  if (essence.includes('wav')) {
    return 'wav';
  }
  // MP3 and FLAC are checked after the formats above so that every type
  // matched by an earlier branch keeps resolving to the same extension.
  if (essence.includes('mp3') || essence.includes('mpeg')) {
    return 'mp3';
  }
  if (essence.includes('flac')) {
    return 'flac';
  }

  return 'webm';
};
89+
5190
const cleanup = () => {
5291
if (mediaRecorderRef.current) {
5392
mediaRecorderRef.current.removeEventListener('dataavailable', (event: BlobEvent) => {
@@ -73,12 +112,13 @@ const useSpeechToTextExternal = (
73112

74113
const handleStop = () => {
75114
if (audioChunks.length > 0) {
76-
const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
115+
const audioBlob = new Blob(audioChunks, { type: audioMimeType });
116+
const fileExtension = getFileExtension(audioMimeType);
77117

78118
setAudioChunks([]);
79119

80120
const formData = new FormData();
81-
formData.append('audio', audioBlob, 'audio.wav');
121+
formData.append('audio', audioBlob, `audio.${fileExtension}`);
82122
setIsRequestBeingMade(true);
83123
cleanup();
84124
processAudio(formData);
@@ -133,7 +173,12 @@ const useSpeechToTextExternal = (
133173
if (audioStream.current) {
134174
try {
135175
setAudioChunks([]);
136-
mediaRecorderRef.current = new MediaRecorder(audioStream.current);
176+
const bestMimeType = getBestSupportedMimeType();
177+
setAudioMimeType(bestMimeType);
178+
179+
mediaRecorderRef.current = new MediaRecorder(audioStream.current, {
180+
mimeType: bestMimeType,
181+
});
137182
mediaRecorderRef.current.addEventListener('dataavailable', (event: BlobEvent) => {
138183
audioChunks.push(event.data);
139184
});
@@ -221,7 +266,7 @@ const useSpeechToTextExternal = (
221266
return () => {
222267
window.removeEventListener('keydown', handleKeyDown);
223268
};
224-
// eslint-disable-next-line react-hooks/exhaustive-deps
269+
225270
}, [isListening]);
226271

227272
return {

0 commit comments

Comments
 (0)