Skip to content

Commit c5ed341

Browse files
flaviut and Pionxzh authored
feat: support multi-modal (#179)
* Implement exporting multi-modal inputs * Show dall-e results * chore: simplify type --------- Co-authored-by: Pionxzh <[email protected]>
1 parent 1e7f0fa commit c5ed341

File tree

4 files changed

+121
-65
lines changed

4 files changed

+121
-65
lines changed

packages/userscript/src/api.ts

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,10 @@ interface MessageMeta {
6060
export type AuthorRole = 'system' | 'assistant' | 'user' | 'tool'
6161

6262
interface MultiModalInputImage {
63+
/**
64+
* hack: this comes from the api in the form of 'file-service://file-base64', but we replace it
65+
* automatically in the api wrapper with a data uri
66+
*/
6367
asset_pointer: string
6468
content_type: 'image_asset_pointer' & (string & {})
6569
height: number
@@ -149,9 +153,20 @@ export interface ApiConversations {
149153
total: number
150154
}
151155

156+
interface ApiFileDownload {
157+
status: 'success'
158+
/** signed download url */
159+
download_url: string
160+
metadata: {}
161+
file_name: string
162+
/** iso8601 datetime string */
163+
creation_time: string
164+
}
165+
152166
const sessionApi = urlcat(baseUrl, '/api/auth/session')
153167
const conversationApi = (id: string) => urlcat(apiUrl, '/conversation/:id', { id })
154168
const conversationsApi = (offset: number, limit: number) => urlcat(apiUrl, '/conversations', { offset, limit })
169+
const fileDownloadApi = (id: string) => urlcat(apiUrl, '/files/:id/download', { id })
155170

156171
export async function getCurrentChatId(): Promise<string> {
157172
if (isSharePage()) {
@@ -169,22 +184,54 @@ export async function getCurrentChatId(): Promise<string> {
169184
throw new Error('No chat id found.')
170185
}
171186

187+
async function fetchImageFromPointer(uri: string) {
188+
const pointer = uri.replace('file-service://', '')
189+
const imageDetails = await fetchApi<ApiFileDownload>(fileDownloadApi(pointer))
190+
const image = await fetch(imageDetails.download_url)
191+
const blob = await image.blob()
192+
const base64 = await new Promise<string>((resolve, reject) => {
193+
const reader = new FileReader()
194+
reader.onerror = reject
195+
reader.onload = () => resolve(reader.result as string)
196+
reader.readAsDataURL(blob)
197+
})
198+
return base64.replace(/^data:.*?;/, `data:${image.headers.get('content-type')};`)
199+
}
200+
201+
/** replaces `file-service://` pointers with data uris containing the image */
202+
async function enhanceImageAssets(conversation: ApiConversationWithId): Promise<ApiConversationWithId> {
203+
const imageAssets = Object.values(conversation.mapping).flatMap((node) => {
204+
if (!node.message) return []
205+
if (node.message.content.content_type !== 'multimodal_text') return []
206+
return node.message.content.parts.filter(
207+
(part): part is MultiModalInputImage =>
208+
typeof part !== 'string' && part.asset_pointer.startsWith('file-service://'),
209+
)
210+
})
211+
212+
await Promise.all(imageAssets.map(async (asset) => {
213+
asset.asset_pointer = await fetchImageFromPointer(asset.asset_pointer)
214+
}))
215+
216+
return conversation
217+
}
218+
172219
export async function fetchConversation(chatId: string): Promise<ApiConversationWithId> {
173220
if (chatId.startsWith('__share__')) {
174221
const shareConversation = getConversationFromSharePage() as ApiConversation
175222
const id = chatId.replace('__share__', '')
176-
return {
223+
return enhanceImageAssets({
177224
id,
178225
...shareConversation,
179-
}
226+
})
180227
}
181228

182229
const url = conversationApi(chatId)
183230
const conversation = await fetchApi<ApiConversation>(url)
184-
return {
231+
return enhanceImageAssets({
185232
id: chatId,
186233
...conversation,
187-
}
234+
})
188235
}
189236

190237
async function fetchConversations(offset = 0, limit = 20): Promise<ApiConversations> {

packages/userscript/src/exporter/html.ts

Lines changed: 30 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -88,36 +88,37 @@ const transformAuthor = (author: ConversationNodeMessage['author']): string => {
8888
const transformContent = (
8989
content: ConversationNodeMessage['content'],
9090
metadata: ConversationNodeMessage['metadata'],
91+
postProcess: (input: string) => string = input => input,
9192
) => {
9293
switch (content.content_type) {
9394
case 'text':
94-
return content.parts?.join('\n') || ''
95+
return postProcess(content.parts?.join('\n') || '')
9596
case 'code':
96-
return `Code:\n\`\`\`\n${content.text}\n\`\`\`` || ''
97+
return postProcess(`Code:\n\`\`\`\n${content.text}\n\`\`\`` || '')
9798
case 'execution_output':
98-
return `Result:\n\`\`\`\n${content.text}\n\`\`\`` || ''
99+
return postProcess(`Result:\n\`\`\`\n${content.text}\n\`\`\`` || '')
99100
case 'tether_quote':
100-
return `> ${content.title || content.text || ''}`
101+
return postProcess(`> ${content.title || content.text || ''}`)
101102
case 'tether_browsing_code':
102-
return '' // TODO: implement
103+
return postProcess('') // TODO: implement
103104
case 'tether_browsing_display': {
104105
const metadataList = metadata?._cite_metadata?.metadata_list
105106
if (Array.isArray(metadataList) && metadataList.length > 0) {
106-
return metadataList.map(({ title, url }) => {
107+
return postProcess(metadataList.map(({ title, url }) => {
107108
return `> [${title}](${url})`
108-
}).join('\n')
109+
}).join('\n'))
109110
}
110-
return ''
111+
return postProcess('')
111112
}
112113
case 'multimodal_text': {
113114
return content.parts?.map((part) => {
114-
if (typeof part === 'string') return part
115-
if (part.asset_pointer) return `![image](${part.asset_pointer})`
116-
return '[Unsupported multimodal content]'
115+
if (typeof part === 'string') return postProcess(part)
116+
if (part.asset_pointer) return `<img src="${part.asset_pointer}" height="${part.height}" width="${part.width}" />`
117+
return postProcess('[Unsupported multimodal content]')
117118
}).join('\n') || ''
118119
}
119120
default:
120-
return '[Unsupported Content]'
121+
return postProcess('[Unsupported Content]')
121122
}
122123
}
123124

@@ -150,30 +151,31 @@ function conversationToHtml(conversation: ConversationResult, avatar: string, me
150151
if (!message || !message.content) return null
151152

152153
if (message.recipient !== 'all') return null // ChatGPT is talking to tool
153-
if (message.author.role === 'tool') return null // Skip tool's intermediate message
154+
// Skip tool's intermediate message.
155+
//
156+
// HACK: we special case the content_type 'multimodal_text' here because it is used by
157+
// the dall-e tool to return the image result, and we do want to show that.
158+
if (message.author.role === 'tool' && message.content.content_type !== 'multimodal_text') return null
154159

155-
const isUser = message.author.role === 'user'
156-
const isAssistant = message.author.role === 'assistant'
157160
const author = transformAuthor(message.author)
158161
const model = message?.metadata?.model_slug === 'gpt-4' ? 'GPT-4' : 'GPT-3'
159-
const authorType = isUser ? 'user' : model
160-
const avatarEl = isUser
162+
const authorType = message.author.role === 'user' ? 'user' : model
163+
const avatarEl = message.author.role === 'user'
161164
? `<img alt="${author}" />`
162165
: '<svg width="41" height="41"><use xlink:href="#chatgpt" /></svg>'
163-
let content = transformContent(message.content, message.metadata)
164-
if (isAssistant) {
165-
content = transformFootNotes(content, message.metadata)
166-
}
167-
168-
let conversationContent = content
169166

170-
if (isUser) {
171-
conversationContent = `<p>${escapeHtml(content)}</p>`
167+
let postSteps: Array<(input: string) => string> = []
168+
if (message.author.role === 'assistant') {
169+
postSteps = [...postSteps, input => transformFootNotes(input, message.metadata)]
170+
}
171+
if (message.author.role === 'user') {
172+
postSteps = [...postSteps, input => `<p>${escapeHtml(input)}</p>`]
172173
}
173174
else {
174-
const root = fromMarkdown(content)
175-
conversationContent = toHtml(root)
175+
postSteps = [...postSteps, input => toHtml(fromMarkdown(input))]
176176
}
177+
const postProcess = (input: string) => postSteps.reduce((acc, fn) => fn(acc), input)
178+
const content = transformContent(message.content, message.metadata, postProcess)
177179

178180
const timestamp = message?.create_time ?? ''
179181
const showTimestamp = enableTimestamp && timeStampHtml && timestamp
@@ -194,7 +196,7 @@ function conversationToHtml(conversation: ConversationResult, avatar: string, me
194196
</div>
195197
<div class="conversation-content-wrapper">
196198
<div class="conversation-content">
197-
${conversationContent}
199+
${content}
198200
</div>
199201
</div>
200202
${timestampHtml}

packages/userscript/src/exporter/markdown.ts

Lines changed: 24 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -83,36 +83,35 @@ const transformAuthor = (author: ConversationNodeMessage['author']): string => {
8383
const transformContent = (
8484
content: ConversationNodeMessage['content'],
8585
metadata: ConversationNodeMessage['metadata'],
86+
postProcess: (input: string) => string = input => input,
8687
) => {
8788
switch (content.content_type) {
8889
case 'text':
89-
return content.parts?.join('\n') || ''
90+
return postProcess(content.parts?.join('\n') || '')
9091
case 'code':
91-
return `Code:\n\`\`\`\n${content.text}\n\`\`\`` || ''
92+
return postProcess(`Code:\n\`\`\`\n${content.text}\n\`\`\`` || '')
9293
case 'execution_output':
93-
return `Result:\n\`\`\`\n${content.text}\n\`\`\`` || ''
94+
return postProcess(`Result:\n\`\`\`\n${content.text}\n\`\`\`` || '')
9495
case 'tether_quote':
95-
return `> ${content.title || content.text || ''}`
96+
return postProcess(`> ${content.title || content.text || ''}`)
9697
case 'tether_browsing_code':
97-
return '' // TODO: implement
98+
return postProcess('') // TODO: implement
9899
case 'tether_browsing_display': {
99100
const metadataList = metadata?._cite_metadata?.metadata_list
100101
if (Array.isArray(metadataList) && metadataList.length > 0) {
101-
return metadataList.map(({ title, url }) => {
102-
return `> [${title}](${url})`
103-
}).join('\n')
102+
return postProcess(metadataList.map(({ title, url }) => `> [${title}](${url})`).join('\n'))
104103
}
105-
return ''
104+
return postProcess('')
106105
}
107106
case 'multimodal_text': {
108107
return content.parts?.map((part) => {
109-
if (typeof part === 'string') return part
108+
if (typeof part === 'string') return postProcess(part)
110109
if (part.asset_pointer) return `![image](${part.asset_pointer})`
111-
return '[Unsupported multimodal content]'
110+
return postProcess('[Unsupported multimodal content]')
112111
}).join('\n') || ''
113112
}
114113
default:
115-
return '[Unsupported Content]'
114+
return postProcess('[Unsupported Content]')
116115
}
117116
}
118117

@@ -179,7 +178,11 @@ function conversationToMarkdown(conversation: ConversationResult, metaList?: Exp
179178
if (!message || !message.content) return null
180179

181180
if (message.recipient !== 'all') return null // ChatGPT is talking to tool
182-
if (message.author.role === 'tool') return null // Skip tool's intermediate message
181+
// Skip tool's intermediate message.
182+
//
183+
// HACK: we special case the content_type 'multimodal_text' here because it is used by
184+
// the dall-e tool to return the image result, and we do want to show that.
185+
if (message.author.role === 'tool' && message.content.content_type !== 'multimodal_text') return null
183186

184187
const timestamp = message?.create_time ?? ''
185188
const showTimestamp = enableTimestamp && timeStampHtml && timestamp
@@ -193,16 +196,17 @@ function conversationToMarkdown(conversation: ConversationResult, metaList?: Exp
193196

194197
const isUser = message.author.role === 'user'
195198
const author = transformAuthor(message.author)
196-
let content = transformContent(message.content, message.metadata)
199+
200+
let postSteps: Array<(input: string) => string> = []
197201
if (message.author.role === 'assistant') {
198-
content = transformFootNotes(content, message.metadata)
202+
postSteps = [...postSteps, input => transformFootNotes(input, message.metadata)]
199203
}
200-
201-
// User's message will not be reformatted
202-
if (!isUser && content) {
203-
const root = fromMarkdown(content)
204-
content = toMarkdown(root)
204+
if (!isUser) { // User's message will not be reformatted
205+
postSteps = [...postSteps, input => toMarkdown(fromMarkdown(input))]
205206
}
207+
const postProcess = (input: string) => postSteps.reduce((acc, fn) => fn(acc), input)
208+
const content = transformContent(message.content, message.metadata, postProcess)
209+
206210
return `#### ${author}:\n${timestampHtml}${content}`
207211
}).filter(Boolean).join('\n\n')
208212

packages/userscript/src/exporter/text.ts

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,36 +26,35 @@ const transformAuthor = (author: ConversationNodeMessage['author']): string => {
2626
const transformContent = (
2727
content: ConversationNodeMessage['content'],
2828
metadata: ConversationNodeMessage['metadata'],
29+
postProcess: (input: string) => string = input => input,
2930
) => {
3031
switch (content.content_type) {
3132
case 'text':
32-
return content.parts?.join('\n') || ''
33+
return postProcess(content.parts?.join('\n') || '')
3334
case 'code':
34-
return content.text || ''
35+
return postProcess(content.text || '')
3536
case 'execution_output':
36-
return content.text || ''
37+
return postProcess(content.text || '')
3738
case 'tether_quote':
38-
return `> ${content.title || content.text || ''}`
39+
return postProcess(`> ${content.title || content.text || ''}`)
3940
case 'tether_browsing_code':
40-
return '' // TODO: implement
41+
return postProcess('') // TODO: implement
4142
case 'tether_browsing_display': {
4243
const metadataList = metadata?._cite_metadata?.metadata_list
4344
if (Array.isArray(metadataList) && metadataList.length > 0) {
44-
return metadataList.map(({ title, url }) => {
45-
return `> [${title}](${url})`
46-
}).join('\n')
45+
return postProcess(metadataList.map(({ title, url }) => `> [${title}](${url})`).join('\n'))
4746
}
48-
return ''
47+
return postProcess('')
4948
}
5049
case 'multimodal_text': {
5150
return content.parts?.map((part) => {
52-
if (typeof part === 'string') return part
51+
if (typeof part === 'string') return postProcess(part)
5352
if (part.asset_pointer) return `![image](${part.asset_pointer})`
54-
return '[Unsupported multimodal content]'
53+
return postProcess('[Unsupported multimodal content]')
5554
}).join('\n') || ''
5655
}
5756
default:
58-
return '[Unsupported Content]'
57+
return postProcess('[Unsupported Content]')
5958
}
6059
}
6160

@@ -107,7 +106,11 @@ export async function exportToText() {
107106
if (!message || !message.content) return null
108107

109108
if (message.recipient !== 'all') return null // ChatGPT is talking to tool
110-
if (message.author.role === 'tool') return null // Skip tool's intermediate message
109+
// Skip tool's intermediate message.
110+
//
111+
// HACK: we special case the content_type 'multimodal_text' here because it is used by
112+
// the dall-e tool to return the image result, and we do want to show that.
113+
if (message.author.role === 'tool' && message.content.content_type !== 'multimodal_text') return null
111114

112115
const author = transformAuthor(message.author)
113116
let content = transformContent(message.content, message.metadata)

0 commit comments

Comments (0)