1
0

chat.ts 25 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847
  1. import { Queue } from "@/utils/queue"
  2. import { Message, Role, Screenplay, Talk, textsToScreenplay } from "./messages";
  3. import { Viewer } from "@/features/vrmViewer/viewer";
  4. import { getEchoChatResponseStream } from "./echoChat";
  5. import {
  6. getOpenAiChatResponseStream,
  7. getOpenAiVisionChatResponse,
  8. } from "./openAiChat";
  9. import {
  10. getLlamaCppChatResponseStream,
  11. getLlavaCppChatResponse,
  12. } from "./llamaCppChat";
  13. import { getWindowAiChatResponseStream } from "./windowAiChat";
  14. import {
  15. getOllamaChatResponseStream,
  16. getOllamaVisionChatResponse,
  17. } from "./ollamaChat";
  18. import { getKoboldAiChatResponseStream } from "./koboldAiChat";
  19. import { rvc } from "@/features/rvc/rvc";
  20. import { coquiLocal } from "@/features/coquiLocal/coquiLocal";
  21. import { piper } from "@/features/piper/piper";
  22. import { elevenlabs } from "@/features/elevenlabs/elevenlabs";
  23. import { speecht5 } from "@/features/speecht5/speecht5";
  24. import { openaiTTS } from "@/features/openaiTTS/openaiTTS";
  25. import { localXTTSTTS } from "@/features/localXTTS/localXTTS";
  26. import { AmicaLife } from "@/features/amicaLife/amicaLife";
  27. import { cleanTalk } from "@/utils/cleanTalk";
  28. import { processResponse } from "@/utils/processResponse";
  29. import { wait } from "@/utils/wait";
  30. import {
  31. isCharacterIdle,
  32. characterIdleTime,
  33. resetIdleTimer,
  34. } from "@/utils/isIdle";
  35. import { getOpenRouterChatResponseStream } from "./openRouterChat";
  36. import {
  37. AmicaLifeParams,
  38. ChatbotBackend,
  39. RVC,
  40. STTBackend,
  41. TTSBackend,
  42. VisionBackend,
  43. } from "@/types/backend";
  44. import { loadAudioAsFloat32Array } from "../diagnosed/sttDiagnosis";
  45. import { WaveFile } from "wavefile";
  46. import { openaiWhisper } from "../openaiWhisper/openaiWhisper";
  47. import { Transcriber } from "@/hooks/useTranscriber";
  48. import { whispercpp } from "../whispercpp/whispercpp";
  /** A synthesized line queued for playback; consumed by `Chat.processSpeakJobs`. */
  interface Speak {
    /** Audio produced by `Chat.fetchAudio`, or `null` when no audio was generated. */
    audioBuffer: ArrayBuffer | null;
    /** The screenplay this audio was generated from. */
    screenplay: Screenplay;
    /** `currentStreamIdx` of the stream that produced the job; jobs from other streams are skipped. */
    streamIdx: number;
  }
  /** A sentence waiting for speech synthesis; consumed by `Chat.processTtsJobs`. */
  interface TTSJob {
    /** The screenplay whose `talk` is passed to `Chat.fetchAudio`. */
    screenplay: Screenplay;
    /** `currentStreamIdx` of the stream that produced the job; jobs from other streams are skipped. */
    streamIdx: number;
  }
  /** Settings handed to `Chat.initialize` that select and parameterize every backend. */
  export interface ChatConfig {
    /** Character name; forwarded to the llama.cpp, window.ai, KoboldAI and LLaVA backends. */
    name: string;
    /**
     * Speech backend that `Chat.fetchAudio` dispatches on.
     * "none" and "coqui" have no synthesis branch there and produce no audio.
     */
    tts_backend: "none" | "piper" | "coqui" | "elevenlabs" | "speecht5" | "openai_tts" | "localXTTS" | "coquiLocal";
    /** LLM backend that `Chat.getChatResponseStream` dispatches on. */
    chatbot_backend: "openai" | "llamacpp" | "ollama" | "koboldai" | "windowai" | "openrouter";
    /** Speech-to-text backend that `Chat.getSTTResponse` dispatches on. */
    stt_backend: "whisper_browser" | "whisper_openai" | "whispercpp";
    /** Image-description backend that `Chat.getVisionResponse` dispatches on. */
    vision_backend: "vision_llamacpp" | "vision_ollama" | "vision_openai";
    /** First message of every chat request. */
    system_prompt: string;
    /** First message of every image-description request. */
    vision_system_prompt: string;
    /** Per-backend chatbot settings; cast to the selected backend's entry before use. */
    chatbot_params: ChatbotBackend;
    /** Per-backend TTS settings; cast to the selected backend's entry before use. */
    tts_params: TTSBackend;
    /** Per-backend STT settings, indexed by backend name. */
    stt_params: STTBackend;
    /** Per-backend vision settings; cast to the selected backend's entry before use. */
    vision_params: VisionBackend;
    /** Idle settings; `time_to_sleep_sec` and `amica_life_enabled` are read by `Chat`. */
    amica_life_params: AmicaLifeParams;
    /** Voice-conversion settings; applied to TTS output when `rvc_enabled === "true"`. */
    rvc_params: RVC;
    // Add more as needed
  }
  88. export class Chat {
  /** Becomes true at the end of `initialize` and false again in `clean`. */
  public initialized = false;
  /** When true, `processTtsJobs` and `processSpeakJobs` exit. */
  private shouldStopProcessing = false;

  // Collaborators and UI callbacks. All are assigned by `initialize`, except
  // `setChatLog`, which no method of this class assigns.
  public transcriber?: Transcriber;
  public amicaLife?: AmicaLife;
  public viewer?: Viewer;
  public setChatLog?: (messageLog: Message[]) => void;
  public setUserMessage?: (message: string) => void;
  public setAssistantMessage?: (message: string) => void;
  public setShownMessage?: (role: Role) => void;
  public setChatProcessing?: (processing: boolean) => void;
  public setChatSpeaking?: (speaking: boolean) => void;
  public setWhisperCppOutput?: (output: any) => void;
  public setWhisperOpenAIOutput?: (output: any) => void;

  // `stream` and `reader` start as null and are not assigned by any method
  // of this class; response streams and their readers are appended to
  // `streams` / `readers` instead.
  public stream: ReadableStream<Uint8Array> | null = null;
  public streams: ReadableStream<Uint8Array>[] = [];
  public reader: ReadableStreamDefaultReader<Uint8Array> | null = null;
  public readers: ReadableStreamDefaultReader<Uint8Array>[] = [];

  /** Sentences waiting for synthesis; drained by `processTtsJobs`. */
  public ttsJobs: Queue<TTSJob>;
  /** Synthesized sentences waiting for playback; drained by `processSpeakJobs`. */
  public speakJobs: Queue<Speak>;

  /** Assistant text accumulated for the bubble currently being built. */
  private currentAssistantMessage = "";
  /** User text accumulated for the bubble currently being built. */
  private currentUserMessage = "";
  /** Epoch milliseconds of the last `updateAwake` call. */
  private lastAwake = 0;

  /** Completed messages; sent with every chat request. */
  public messageList: Message[] = [];
  /** Incremented whenever a response is started or interrupted; used to discard stale jobs. */
  public currentStreamIdx = 0;
  public config?: ChatConfig;

  private onChatCompleteResolver?: () => void;
  /** Resolved by `processSpeakJobs` once both job queues are empty. */
  public onChatComplete?: Promise<void>;

  /** Creates an idle chat with empty queues; call `initialize` before use. */
  constructor() {
    this.ttsJobs = new Queue<TTSJob>();
    this.speakJobs = new Queue<Speak>();
  }
  /**
   * Stores the collaborators, UI callbacks and configuration, then starts the
   * two background job loops and marks the chat as initialized.
   *
   * @param transcriber Used by `getSTTResponse` for the "whisper_browser" backend.
   * @param amicaLife Notified of every non-idle user message.
   * @param viewer Its `model` speaks the synthesized audio.
   * @param setUserMessage Receives the text of the user bubble.
   * @param setAssistantMessage Receives the text of the assistant bubble.
   * @param setShownMessage Receives the role of the bubble that was last updated.
   * @param setChatProcessing Toggled around response-stream handling.
   * @param setChatSpeaking Toggled around audio playback.
   * @param setWhisperCppOutput Receives whisper.cpp transcripts.
   * @param setWhisperOpenAIOutput Receives OpenAI Whisper transcripts.
   * @param config Backend selection and parameters.
   */
  public initialize(
    transcriber: Transcriber,
    amicaLife: AmicaLife,
    viewer: Viewer,
    setUserMessage: (message: string) => void,
    setAssistantMessage: (message: string) => void,
    setShownMessage: (role: Role) => void,
    setChatProcessing: (processing: boolean) => void,
    setChatSpeaking: (speaking: boolean) => void,
    setWhisperCppOutput: (output: any) => void,
    setWhisperOpenAIOutput: (output: any) => void,
    config: ChatConfig,
  ) {
    // Configuration first, so everything started below can read it.
    this.config = config;
    console.log("Config ", config)

    // Collaborators.
    this.viewer = viewer;
    this.amicaLife = amicaLife;
    this.transcriber = transcriber;

    // UI callbacks.
    this.setShownMessage = setShownMessage;
    this.setUserMessage = setUserMessage;
    this.setAssistantMessage = setAssistantMessage;
    this.setChatProcessing = setChatProcessing;
    this.setChatSpeaking = setChatSpeaking;
    this.setWhisperOpenAIOutput = setWhisperOpenAIOutput;
    this.setWhisperCppOutput = setWhisperCppOutput;

    // The loops check this flag as soon as they start, so clear it first.
    this.shouldStopProcessing = false;

    // Fire-and-forget: both loops keep polling until `clean` sets the stop flag.
    void this.processTtsJobs();
    void this.processSpeakJobs();

    this.updateAwake();
    this.initialized = true;
  }
  /**
   * Replaces the conversation history and clears both in-progress bubbles.
   *
   * `currentStreamIdx` is bumped so that any jobs still queued for the
   * previous conversation are treated as stale and skipped.
   *
   * @param messages The new history.
   */
  public setMessageList(messages: Message[]) {
    this.messageList = messages;
    this.currentAssistantMessage = "";
    this.currentUserMessage = "";
    // this.setChatLog!(this.messageList!);
    // Optional calls: the callbacks only exist after `initialize`.
    this.setAssistantMessage?.(this.currentAssistantMessage);
    // Reset the user bubble from the user buffer (previously the assistant
    // buffer was passed here; both are empty, so the output is the same).
    this.setUserMessage?.(this.currentUserMessage);
    this.currentStreamIdx++;
  }
  /**
   * Sends synthesized audio through the configured RVC voice-conversion
   * service and returns the converted audio.
   *
   * The numeric settings are stored as strings in `rvc_params` and are parsed
   * here. `rvc_rms_mix_rate` and `rvc_protect` are parsed with `parseFloat`
   * because `parseInt` would truncate fractional settings such as "0.33" to 0.
   * NOTE(review): RVC documents both as fractional values; confirm against
   * the RVC server in use.
   *
   * @param audio Audio returned by a TTS backend.
   * @returns The `audio` field of the `rvc` response.
   */
  public async handleRvc(audio: any) {
    const rvcParams = this.config?.rvc_params;

    const voice = await rvc(
      audio,
      rvcParams?.rvc_url!,
      rvcParams?.rvc_model_name!,
      rvcParams?.rvc_index_path!,
      parseInt(rvcParams?.rvc_f0_upkey!, 10),
      rvcParams?.rvc_f0_method!,
      rvcParams?.rvc_index_rate!,
      parseInt(rvcParams?.rvc_filter_radius!, 10),
      parseInt(rvcParams?.rvc_resample_sr!, 10),
      parseFloat(rvcParams?.rvc_rms_mix_rate!),
      parseFloat(rvcParams?.rvc_protect!),
    );

    return voice.audio;
  }
  /**
   * Returns the result of `characterIdleTime` for the configured
   * `time_to_sleep_sec` and the time of the last `updateAwake` call.
   */
  public idleTime(): number {
    const timeToSleepSec = this.config?.amica_life_params.time_to_sleep_sec!;
    return characterIdleTime(timeToSleepSec, this.lastAwake);
  }
  /**
   * True unless `isCharacterIdle` reports the character as idle for the
   * configured `time_to_sleep_sec` and the time of the last `updateAwake` call.
   */
  public isAwake() {
    const timeToSleepSec = this.config?.amica_life_params.time_to_sleep_sec!;
    const idle = isCharacterIdle(timeToSleepSec, this.lastAwake);
    return !idle;
  }
  /** Records the current time as the last activity and resets the shared idle timer. */
  public updateAwake() {
    this.lastAwake = Date.now();
    resetIdleTimer();
  }
  /**
   * Background loop that turns queued sentences into audio.
   *
   * Drains `ttsJobs`, synthesizes each job that belongs to the current stream
   * via `fetchAudio`, and enqueues the result on `speakJobs`. When the queue
   * is empty it sleeps for 50 ms and polls again. The loop ends once
   * `shouldStopProcessing` is set.
   */
  public async processTtsJobs() {
    while (!this.shouldStopProcessing) {
      let moreQueued = true;
      while (moreQueued) {
        if (this.shouldStopProcessing) return;

        const job = this.ttsJobs.dequeue();
        if (!job) break;

        // Jobs from an interrupted or replaced stream are dropped unsynthesized.
        if (job.streamIdx === this.currentStreamIdx) {
          const audioBuffer = await this.fetchAudio(job.screenplay.talk);
          this.speakJobs.enqueue({
            audioBuffer,
            screenplay: job.screenplay,
            streamIdx: job.streamIdx,
          });
        }

        moreQueued = this.ttsJobs.size() > 0;
      }

      await wait(50);
    }
  }
  /**
   * Background loop that displays and plays synthesized sentences.
   *
   * Drains `speakJobs`; for each job of the current stream it appends the
   * text to the assistant bubble and, when audio exists, plays it through the
   * viewer's model. Once both job queues are empty after a job, the
   * `onChatComplete` promise is resolved. When the queue is empty the loop
   * sleeps for 50 ms and polls again; it ends once `shouldStopProcessing`
   * is set.
   *
   * A playback failure is logged and the loop continues. Previously such an
   * error ended the loop for good and left the speaking flag set.
   */
  public async processSpeakJobs() {
    while (!this.shouldStopProcessing) {
      do {
        if (this.shouldStopProcessing) return;

        const speak = this.speakJobs.dequeue();
        if (!speak) break;
        // Jobs from an interrupted or replaced stream are dropped.
        if (speak.streamIdx !== this.currentStreamIdx) continue;

        this.bubbleMessage("assistant", speak.screenplay.text);

        if (speak.audioBuffer) {
          this.setChatSpeaking!(true);
          try {
            await this.viewer!.model?.speak(speak.audioBuffer, speak.screenplay);
          } catch (e: unknown) {
            console.error("Failed to play speech", e);
          } finally {
            // Always clear the flag, even when playback threw.
            this.setChatSpeaking!(false);
          }
          // Speaking counts as activity only while the character is still awake.
          if (this.isAwake()) this.updateAwake();
        }

        // Resolve full chat complete
        if (this.speakJobs.size() === 0 && this.ttsJobs.size() === 0) {
          this.onChatCompleteResolver?.();
        }
      } while (this.speakJobs.size() > 0);

      await wait(50);
    }
  }
  /**
   * Runs one complete exchange and waits until it has been spoken.
   *
   * @param message The user's text, or the image data when `vision` is true.
   * @param vision When true the message is routed to `getVisionResponse`,
   *   otherwise to `receiveMessageFromUser`.
   */
  public async runFullInteraction(message: string, vision: boolean) {
    if (vision) {
      await this.getVisionResponse(message);
    } else {
      await this.receiveMessageFromUser(message, false);
    }

    await wait(3000);
    await this.onChatComplete;
  }
  /**
   * Appends text to the in-progress bubble of the given role and updates the UI.
   *
   * Switching roles commits the other role's in-progress text to
   * `messageList`. For the assistant, a non-empty in-progress message is also
   * committed and a fresh one started when the character is not awake and
   * `amica_life_enabled` is "true".
   *
   * @param role Whose bubble to update; roles other than "user" and
   *   "assistant" only update the shown role.
   * @param text Text to append.
   */
  public bubbleMessage(role: Role, text: string) {
    // TODO: currentUser & Assistant message should be contain the message with emotion in it
    switch (role) {
      case "user": {
        // Separate from an existing partial message with a single space.
        const separator = this.currentUserMessage === "" ? "" : " ";
        this.currentUserMessage += separator + text;

        this.setUserMessage!(this.currentUserMessage);
        this.setAssistantMessage!("");

        if (this.currentAssistantMessage !== "") {
          this.messageList!.push({
            role: "assistant",
            content: this.currentAssistantMessage,
          });
          this.currentAssistantMessage = "";
        }
        break;
      }

      case "assistant": {
        const startsNewMessage =
          this.currentAssistantMessage !== "" &&
          !this.isAwake() &&
          this.config?.amica_life_params.amica_life_enabled === "true";

        if (startsNewMessage) {
          this.messageList!.push({
            role: "assistant",
            content: this.currentAssistantMessage,
          });
          this.currentAssistantMessage = text;
        } else {
          this.currentAssistantMessage += text;
          this.setUserMessage!("");
        }
        this.setAssistantMessage!(this.currentAssistantMessage);

        if (this.currentUserMessage !== "") {
          this.messageList!.push({
            role: "user",
            content: this.currentUserMessage,
          });
          this.currentUserMessage = "";
        }
        break;
      }
    }

    this.setShownMessage!(role);
    console.debug("bubbler", this.messageList);
  }
  /**
   * Abandons the response currently in progress.
   *
   * Bumps `currentStreamIdx` so that queued and in-flight work for the old
   * stream is recognised as stale, cancels the most recent stream reader and
   * clears both job queues.
   *
   * The reader is taken from `readers`, because that is where
   * `handleChatResponseStream` stores it; `this.reader` is honoured when set.
   * The previous `!reader.closed` guard has been removed: `closed` is a
   * Promise, so the negation was always false and `cancel()` never ran.
   */
  public async interrupt() {
    this.currentStreamIdx++;

    const activeReader = this.reader ?? this.readers[this.readers.length - 1];
    if (activeReader) {
      try {
        console.debug("cancelling");
        await activeReader.cancel();
        console.debug("finished cancelling");
      } catch (e: unknown) {
        if (e instanceof TypeError) {
          // cancel() rejects with TypeError when the reader's lock was already
          // released, i.e. there is nothing left to cancel.
          console.debug("reader already released");
        } else {
          console.error(String(e));
        }
      }
    }

    // TODO if llm type is llama.cpp, we can send /stop message here
    this.ttsJobs.clear();
    this.speakJobs.clear();
    // TODO stop viewer from speaking
  }
  /**
   * Entry point for a new message, whether typed, transcribed from voice, or
   * generated by the idle (amicaLife) system.
   *
   * Interrupts any response in progress, shows the message in the user bubble
   * (unless it comes from amicaLife) and requests a new response stream.
   *
   * @param message The message text; `null` and "" are ignored.
   * @param amicaLife True when the message originates from the idle system;
   *   such messages are sent to the chatbot without being displayed.
   */
  public async receiveMessageFromUser(message: string, amicaLife: boolean) {
    const nothingToSend = message === null || message === "";
    if (nothingToSend) {
      return;
    }

    // Replaced for every new message; resolved by `processSpeakJobs`.
    this.onChatComplete = new Promise<void>((resolve) => {
      this.onChatCompleteResolver = resolve;
    });

    console.time("performance_interrupting");
    console.debug("interrupting...");
    await this.interrupt();
    console.timeEnd("performance_interrupting");

    await wait(0);
    console.debug("wait complete");

    const fromUser = !amicaLife;
    if (fromUser) {
      console.log("receiveMessageFromUser", message);
      this.amicaLife?.receiveMessageFromUser(message);

      // Messages without a bracketed tag get a neutral one.
      const hasTag = /\[.*?\]/.test(message);
      if (!hasTag) {
        message = `[neutral] ${message}`;
      }

      this.updateAwake();
      this.bubbleMessage("user", message);
    }

    // make new stream
    const latestUserContent = amicaLife ? message : this.currentUserMessage;
    const messages: Message[] = [
      { role: "system", content: this.config?.system_prompt! },
      ...this.messageList!,
      { role: "user", content: latestUserContent },
    ];
    // console.debug('messages', messages);

    await this.makeAndHandleStream(messages);
  }
  /**
   * Requests a response stream for the given messages and processes it.
   *
   * @param messages The full prompt, including the system message.
   * @returns The result of `handleChatResponseStream`, or an error string
   *   when the stream could not be obtained or was null.
   */
  public async makeAndHandleStream(messages: Message[]) {
    let stream;
    try {
      stream = await this.getChatResponseStream(messages);
    } catch (e: any) {
      const errMsg = e.toString();
      console.error("Failed to get chat response", errMsg);
      return errMsg;
    }

    this.streams.push(stream);

    if (stream == null) {
      const errMsg = "Error: Null stream encountered.";
      console.error("Null stream encountered", errMsg);
      return errMsg;
    }

    return await this.handleChatResponseStream();
  }
  /**
   * Reads the most recently added response stream to its end, splitting the
   * text into sentences via `processResponse` and queueing each one for TTS.
   *
   * Reading stops early when another stream takes over (`currentStreamIdx`
   * changes) or when `processResponse` asks to break. Errors while reading
   * are shown in the assistant bubble.
   *
   * Changes from the previous version:
   * - The reader lock is now always released. The old `!reader.closed` guard
   *   tested a Promise, so it was always false and the lock was never released.
   * - Chunks that are not strings are decoded as UTF-8 instead of being
   *   string-concatenated, which rendered byte arrays as "104,105,...".
   *
   * @returns The `aiTextLog` accumulated by `processResponse`, or `undefined`
   *   when there is no stream.
   */
  public async handleChatResponseStream() {
    if (this.streams.length === 0) {
      console.log("no stream!");
      return;
    }

    this.currentStreamIdx++;
    const streamIdx = this.currentStreamIdx;
    this.setChatProcessing!(true);

    console.time("chat stream processing");
    const reader = this.streams[this.streams.length - 1].getReader();
    this.readers.push(reader);
    const decoder = new TextDecoder();

    let sentences = new Array<string>();
    let aiTextLog = "";
    let tag = "";
    // NOTE(review): passed to processResponse but never updated from its
    // result; confirm whether processResponse returns an updated value.
    const isThinking = false;
    let rolePlay = "";
    let receivedMessage = "";

    let firstTokenEncountered = false;
    let firstSentenceEncountered = false;
    console.time("performance_time_to_first_token");
    console.time("performance_time_to_first_sentence");

    try {
      while (true) {
        if (this.currentStreamIdx !== streamIdx) {
          console.log("wrong stream idx");
          break;
        }

        const { done, value } = await reader.read();
        if (!firstTokenEncountered) {
          console.timeEnd("performance_time_to_first_token");
          firstTokenEncountered = true;
        }
        if (done) break;

        // The stream is typed as bytes, but string chunks are passed through
        // unchanged so that either chunk type works.
        receivedMessage +=
          typeof value === "string" ? value : decoder.decode(value, { stream: true });
        receivedMessage = receivedMessage.trimStart();

        const proc = processResponse({
          sentences,
          aiTextLog,
          receivedMessage,
          tag,
          isThinking,
          rolePlay,
          callback: (aiTalks: Screenplay[]): boolean => {
            // Generate & play audio for each sentence, display responses
            console.debug("enqueue tts", aiTalks);
            console.debug(
              "streamIdx",
              streamIdx,
              "currentStreamIdx",
              this.currentStreamIdx,
            );

            if (streamIdx !== this.currentStreamIdx) {
              console.log("wrong stream idx");
              return true; // should break
            }

            this.ttsJobs.enqueue({
              screenplay: aiTalks[0],
              streamIdx: streamIdx,
            });

            if (!firstSentenceEncountered) {
              console.timeEnd("performance_time_to_first_sentence");
              firstSentenceEncountered = true;
            }

            return false; // normal processing
          },
        });

        sentences = proc.sentences;
        aiTextLog = proc.aiTextLog;
        receivedMessage = proc.receivedMessage;
        tag = proc.tag;
        rolePlay = proc.rolePlay;
        if (proc.shouldBreak) {
          break;
        }
      }
    } catch (e: any) {
      const errMsg = e.toString();
      this.bubbleMessage!("assistant", errMsg);
      console.error(errMsg);
    } finally {
      try {
        reader.releaseLock();
      } catch (e: unknown) {
        // Never let cleanup mask the outcome of the read loop.
        console.debug("releaseLock failed", e);
      }
      console.timeEnd("chat stream processing");
      // Only the stream that is still current may clear the processing flag.
      if (streamIdx === this.currentStreamIdx) {
        this.setChatProcessing!(false);
      }
    }

    return aiTextLog;
  }
  // TTS
  /**
   * Synthesizes speech for one talk using the configured TTS backend.
   *
   * When `rvc_params.rvc_enabled` is "true" the synthesized audio is passed
   * through `handleRvc` before being returned.
   *
   * @param talk The talk to speak; it is cleaned with `cleanTalk` first.
   * @returns The audio, or `null` when the cleaned message is blank, the
   *   backend has no synthesis branch here ("none", "coqui", or unset), or
   *   synthesis failed (the error is logged).
   */
  public async fetchAudio(talk: Talk): Promise<ArrayBuffer | null> {
    // TODO we should remove non-speakable characters
    // since this depends on the tts backend, we should do it
    // in their respective functions
    // this is just a simple solution for now
    talk = cleanTalk(talk);
    if (talk.message.trim() === "") {
      return null;
    }

    const ttsParams = this.config?.tts_params;
    const useRvc = this.config?.rvc_params?.rvc_enabled === "true";
    const text = talk.message;

    // Calls the selected backend; resolves to null when there is nothing to call.
    const synthesize = async () => {
      switch (this.config?.tts_backend) {
        case "elevenlabs": {
          const p = ttsParams as TTSBackend["elevenlabs"]
          return elevenlabs(p, text, p?.elevenlabs_voiceid!);
        }
        case "speecht5": {
          const p = ttsParams as TTSBackend["speecht5"]
          return speecht5(text, p?.speecht5_speaker_embedding_url!);
        }
        case "openai_tts":
          return openaiTTS(ttsParams as TTSBackend["openai_tts"], text);
        case "localXTTS":
          return localXTTSTTS(ttsParams as TTSBackend["localXTTS"], text);
        case "piper":
          return piper(ttsParams as TTSBackend["piper"], text);
        case "coquiLocal":
          return coquiLocal(ttsParams as TTSBackend["coquiLocal"], text);
        default:
          return null;
      }
    };

    try {
      const voice = await synthesize();
      if (voice === null) {
        return null;
      }
      return useRvc ? await this.handleRvc(voice.audio) : voice.audio;
    } catch (e: any) {
      console.error("Failed to get TTS response", e.toString());
    }

    return null;
  }
  // Chatbot
  /**
   * Requests a response stream from the configured chatbot backend.
   *
   * @param messages The full prompt, including the system message.
   * @returns Whatever the selected backend's stream function returns; the
   *   echo backend is used when no known backend is configured.
   */
  public async getChatResponseStream(messages: Message[]) {
    console.debug("getChatResponseStream", messages);

    const backend = this.config?.chatbot_backend;
    const characterName = this.config?.name!;
    const systemPrompt = this.config?.system_prompt!;
    const backendParams = this.config?.chatbot_params;

    if (backend === "openai") {
      return getOpenAiChatResponseStream(
        backendParams as ChatbotBackend["openai"],
        messages,
      );
    }
    if (backend === "llamacpp") {
      return getLlamaCppChatResponseStream(
        backendParams as ChatbotBackend["llamacpp"],
        characterName,
        systemPrompt,
        messages,
      );
    }
    if (backend === "windowai") {
      return getWindowAiChatResponseStream(characterName, messages);
    }
    if (backend === "ollama") {
      return getOllamaChatResponseStream(
        backendParams as ChatbotBackend["ollama"],
        messages,
      );
    }
    if (backend === "koboldai") {
      return getKoboldAiChatResponseStream(
        characterName,
        systemPrompt,
        backendParams as ChatbotBackend["koboldai"],
        messages,
      );
    }
    if (backend === "openrouter") {
      return getOpenRouterChatResponseStream(
        backendParams as ChatbotBackend["openrouter"],
        messages,
      );
    }

    return getEchoChatResponseStream(messages);
  }
  // STT
  /**
   * Runs the bundled sample recording ("/sample-voice.wav") through the
   * configured speech-to-text backend.
   *
   * - "whisper_browser": hands an `AudioBuffer` to the transcriber.
   * - "whisper_openai" / "whispercpp": uploads a WAV file in the background
   *   and passes the transcript to the matching output callback.
   *
   * The method does not wait for the transcript. Backend errors are logged.
   * The `AudioContext` created for the browser backend is now closed after
   * use; previously one was leaked on every call.
   */
  public async getSTTResponse() {
    const audio = await loadAudioAsFloat32Array("/sample-voice.wav");

    // Builds the upload file; whisper.cpp gets the samples converted to 16-bit.
    const toWavFile = (convertTo16Bit: boolean): File => {
      const wav = new WaveFile();
      wav.fromScratch(1, 16000, "32f", audio);
      if (convertTo16Bit) {
        wav.toBitDepth("16");
      }
      return new File([new Uint8Array(wav.toBuffer())], "input.wav", { type: "audio/wav" });
    };

    try {
      switch (this.config?.stt_backend) {
        case "whisper_browser": {
          console.debug("whisper_browser attempt");
          // since VAD sample rate is same as whisper we do nothing here
          // both are 16000
          const audioCtx = new AudioContext();
          try {
            const buffer = audioCtx.createBuffer(1, audio.length, 16000);
            buffer.copyToChannel(new Float32Array(audio), 0, 0);
            this.transcriber?.start(buffer);
          } finally {
            // The context is only needed to allocate the buffer.
            void audioCtx.close().catch((e: unknown) => {
              console.debug("failed to close AudioContext", e);
            });
          }
          break;
        }

        case "whisper_openai": {
          console.debug("whisper_openai attempt");
          const file = toWavFile(false);

          let prompt;
          // TODO load prompt if it exists

          void (async () => {
            try {
              const transcript = await openaiWhisper(this.config?.stt_params.whisper_openai,file, prompt);
              this.setWhisperOpenAIOutput!(transcript);
            } catch (e: any) {
              console.error("whisper_openai error", e);
            }
          })();
          break;
        }

        case "whispercpp": {
          console.debug("whispercpp attempt");
          const file = toWavFile(true);

          let prompt;
          // TODO load prompt if it exists

          void (async () => {
            try {
              const transcript = await whispercpp(this.config?.stt_params.whispercpp,file, prompt);
              this.setWhisperCppOutput!(transcript);
            } catch (e: any) {
              console.error("whispercpp error", e);
            }
          })();
          break;
        }
      }
    } catch (e: any) {
      console.error("stt_backend error", e);
    }
  }
  // Vision
  /**
   * Asks the configured vision backend to describe an image and, unless only
   * the description is wanted, has the chatbot react to that description.
   *
   * @param imageData The image; the OpenAI backend embeds it as base64 JPEG
   *   data in a data URL.
   * @param onlyVisionResponse When true, return the description without
   *   starting a chat response.
   * @returns The description when `onlyVisionResponse` is true, otherwise
   *   `undefined`. Errors are logged and also yield `undefined`.
   */
  public async getVisionResponse(imageData: string, onlyVisionResponse?: boolean) {
    try {
      const backend = this.config?.vision_backend;
      const characterName = this.config?.name!;
      const visionPrompt = this.config?.vision_system_prompt!;
      const visionParams = this.config?.vision_params;

      console.debug("vision_backend", backend);

      // Replaced for every new interaction; resolved by `processSpeakJobs`.
      this.onChatComplete = new Promise<void>((resolve) => {
        this.onChatCompleteResolver = resolve;
      });

      // Prompt shared by the backends that receive the image separately.
      const textOnlyMessages = (): Message[] => [
        { role: "system", content: visionPrompt },
        ...this.messageList!,
        {
          role: "user",
          content: "Describe the image as accurately as possible",
        },
      ];

      let description = "";
      switch (backend) {
        case "vision_llamacpp":
          description = await getLlavaCppChatResponse(
            characterName,
            visionPrompt,
            visionParams as VisionBackend["vision_llamacpp"],
            textOnlyMessages(),
            imageData,
          );
          break;

        case "vision_ollama":
          description = await getOllamaVisionChatResponse(
            visionParams as VisionBackend["vision_ollama"],
            textOnlyMessages(),
            imageData,
          );
          break;

        case "vision_openai": {
          // Here the image travels inside the final message, and the vision
          // prompt is sent with the "user" role.
          const messages: Message[] = [
            { role: "user", content: visionPrompt },
            ...(this.messageList! as any[]),
            {
              role: "user",
              // @ts-ignore normally this is a string
              content: [
                {
                  type: "text",
                  text: "Describe the image as accurately as possible",
                },
                {
                  type: "image_url",
                  image_url: {
                    url: `data:image/jpeg;base64,${imageData}`,
                  },
                },
              ],
            },
          ];
          description = await getOpenAiVisionChatResponse(
            visionParams as VisionBackend["vision_openai"],
            messages,
          );
          break;
        }

        default:
          console.warn("vision_backend not supported", backend);
          return;
      }

      if (onlyVisionResponse) {
        return description;
      }

      await this.makeAndHandleStream([
        { role: "system", content: this.config?.system_prompt! },
        ...this.messageList!,
        {
          role: "user",
          content: `This is a picture I just took from my webcam (described between [[ and ]] ): [[${description}]] Please respond accordingly and as if it were just sent and as though you can see it.`,
        },
      ]);
    } catch (e: any) {
      console.error("Failed to get vision response ", e.toString());
    }
  }
  /**
   * Shuts the chat down: signals the background loops to stop, interrupts the
   * response in progress, empties both job queues and marks the chat as
   * uninitialized. The interruption is started but not awaited.
   */
  public clean() {
    console.log("Stopping all chat processes...");
    this.shouldStopProcessing = true;

    // Cancel any readers or streams if needed
    void this.interrupt();

    // You could also optionally clear queues
    for (const queue of [this.ttsJobs, this.speakJobs]) {
      queue.clear();
    }

    this.initialized = false;
  }
  737. }