| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165 |
- /* eslint-disable camelcase */
- import { pipeline, env } from "@xenova/transformers";
- // Disable local models
- env.allowLocalModels = false;
- // Define model factories
- // Ensures only one model is created of each type
/**
 * Base class for lazily created, cached pipelines.
 *
 * Subclasses override the static `task`, `model` and `quantized` fields. The
 * first `getInstance()` call on a class starts `pipeline(...)` with those
 * values and stores the resulting promise in the static `instance` field;
 * later calls on the same class return that stored promise.
 */
class PipelineFactory {
  static task = null;
  static model = null;
  static quantized = null;
  static instance = null;

  constructor(tokenizer, model, quantized) {
    this.tokenizer = tokenizer;
    this.model = model;
    this.quantized = quantized;
  }

  /**
   * Return the cached pipeline for this class, creating it on first use.
   *
   * @param {Function|null} progress_callback - Forwarded to `pipeline(...)` as
   *   the `progress_callback` option. Only used by the call that actually
   *   creates the pipeline; ignored while a cached instance exists.
   * @returns {Promise<*>} Resolves with whatever `pipeline(...)` resolves to.
   *   Rejects if pipeline creation fails; the cache is cleared in that case so
   *   that the next call retries instead of returning the same rejection.
   */
  static async getInstance(progress_callback = null) {
    if (this.instance === null) {
      const pending = Promise.resolve(
        pipeline(this.task, this.model, {
          quantized: this.quantized,
          progress_callback,
        }),
      );
      this.instance = pending;
      // A rejected promise must not stay cached, otherwise one failed load
      // would make every later call fail without ever retrying.
      pending.catch(() => {
        // Only reset if nobody replaced the cache entry in the meantime.
        if (this.instance === pending) {
          this.instance = null;
        }
      });
    }
    return this.instance;
  }
}
// Handle requests posted by the main thread: transcribe the audio carried by
// the message and post the result back.
self.addEventListener("message", async (event) => {
  try {
    const message = event.data;

    // TODO use message data
    const transcript = await transcribe(message.audio);
    // null means no result is available; nothing to send in that case.
    if (transcript === null) return;

    // Send the result back to the main thread
    self.postMessage({
      status: "complete",
      task: "automatic-speech-recognition",
      data: transcript,
    });
  } catch (error) {
    // An async listener's rejection is otherwise unhandled and the main thread
    // would never hear back (e.g. malformed message or a rejected transcribe).
    self.postMessage({
      status: "error",
      task: "automatic-speech-recognition",
      data: error,
    });
  }
});
/**
 * Factory for the speech-recognition pipeline.
 *
 * Only overrides the static configuration read by
 * `PipelineFactory.getInstance()`; creation and caching are inherited.
 */
class AutomaticSpeechRecognitionPipelineFactory extends PipelineFactory {
  // Forwarded as the `quantized` pipeline option.
  static quantized = true;

  // TODO load this from config
  // Alternative checkpoint: "distil-whisper/distil-medium.en"
  static model = "Xenova/whisper-tiny.en";

  static task = "automatic-speech-recognition";
}
/**
 * Transcribe audio with the cached speech-recognition pipeline.
 *
 * While running, messages are posted to the main thread:
 *  - objects passed to the model-loading progress callback are forwarded
 *    unchanged;
 *  - after every generation step a `status: "update"` message carries the
 *    result of decoding all chunks seen so far.
 *
 * Any failure, whether while loading the model or while transcribing, is
 * reported as a `status: "error"` message and turns into a `null` result
 * instead of a rejected promise.
 *
 * @param {*} audio - Audio input, handed to the pipeline unchanged.
 * @returns {Promise<*|null>} The pipeline output, or `null` after an error.
 */
const transcribe = async (audio) => {
  // TODO use subtask and language
  // TODO load from config
  const p = AutomaticSpeechRecognitionPipelineFactory;

  // TODO invalidate the cached model when the requested model name or
  // quantisation differs from p.model / p.quantized: update both fields,
  // dispose the existing instance (if any) and reset p.instance to null.

  try {
    // Load transcriber model, forwarding loading progress to the main thread.
    const transcriber = await p.getInstance((data) => {
      self.postMessage(data);
    });

    const time_precision =
      transcriber.processor.feature_extractor.config.chunk_length /
      transcriber.model.config.max_source_positions;

    // Storage for chunks to be processed. Initialise with an empty chunk.
    const chunks_to_process = [
      {
        tokens: [],
        finalised: false,
      },
    ];

    // TODO: Storage for fully-processed and merged chunks

    // Called after each chunk is processed: finalise the current chunk and,
    // unless it was the last one, open a fresh empty chunk after it.
    const chunk_callback = (chunk) => {
      const last = chunks_to_process[chunks_to_process.length - 1];

      // Overwrite last chunk with new info
      Object.assign(last, chunk);
      last.finalised = true;

      if (!chunk.is_last) {
        chunks_to_process.push({
          tokens: [],
          finalised: false,
        });
      }
    };

    // Called after each generation step: refresh the current chunk's tokens
    // and post the decoded state of all chunks as an update.
    const callback_function = (item) => {
      const last = chunks_to_process[chunks_to_process.length - 1];

      // Update tokens of last chunk
      last.tokens = [...item[0].output_token_ids];

      // Merge text chunks
      // TODO optimise so we don't have to decode all chunks every time
      const data = transcriber.tokenizer._decode_asr(chunks_to_process, {
        time_precision,
        return_timestamps: true,
        force_full_sequences: false,
      });

      self.postMessage({
        status: "update",
        task: "automatic-speech-recognition",
        data,
      });
    };

    // Actually run transcription
    return await transcriber(audio, {
      // Greedy
      top_k: 0,
      do_sample: false,

      // Sliding window
      chunk_length_s: 30,
      stride_length_s: 5,

      // Language and task
      language: null,
      task: null,

      // Return timestamps
      return_timestamps: true,
      force_full_sequences: false,

      // Callback functions
      callback_function, // after each generation step
      chunk_callback, // after each chunk is processed
    });
  } catch (error) {
    // Covers model loading as well as transcription, so the main thread is
    // always told when a request cannot be completed.
    self.postMessage({
      status: "error",
      task: "automatic-speech-recognition",
      data: error,
    });
    return null;
  }
};
|