compute_type="int8" if device == "cpu" else "float32" model = WhisperModel(model_size, device=device, compute_type=compute_type) def run(): segments, info = model.transcribe("audio.wav", beam_size=5) print("Detected language '%s' with probability %f" % (info.language, info.language...
```
whisper_model_load: model size    = 140.54 MB
whisper_init_state: kv self size  =   5.25 MB
whisper_init_state: kv cross size =  17.58 MB
system_info: n_threads = 4 / 16 | AVX = 1 | AVX2 = 1 | AVX512 = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = ...
```
```python
from faster_whisper import WhisperModel

def faster_whisper_test():
    model_size = "large-v3"

    # Windows, CPU
    model = WhisperModel(model_size, device="cpu", compute_type="int8", cpu_threads=16)
    # Windows, GPU
    # model = WhisperModel(model_size, device="cuda", compute_type="float16")

    segments, info = model.transcribe("test.mp...
```
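If word-level timing is needed rather than per-segment timing, faster-whisper also exposes a `word_timestamps` option. A minimal sketch (the file name is a placeholder):

```python
from faster_whisper import WhisperModel

model = WhisperModel("large-v3", device="cpu", compute_type="int8", cpu_threads=16)
segments, info = model.transcribe("test.wav", word_timestamps=True)

# Each segment now carries a list of timed words
for segment in segments:
    for word in segment.words:
        print("[%.2fs -> %.2fs] %s" % (word.start, word.end, word.word))
```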
Suppose you set the transcription language for a Japanese video to Chinese: for the first 8 minutes Whisper transcribes into Chinese correctly, but past the 8-minute mark the generated subtitles repeat endlessly and bear no relation to the actual audio.

model: `--model` selects Whisper's transcription model. Quality ranks tiny < base < small < medium < large, with small as the default. Adding `--model medium` or `--model large` switches to a larger model, but transcription...
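The same model choice is available from the Python API. A minimal sketch using openai-whisper's `load_model`, with a hypothetical input file and the language set explicitly:

```python
import whisper

# Larger checkpoints trade speed and memory for accuracy
model = whisper.load_model("medium")  # or "large"

# Explicitly set the language ("zh" = Chinese) instead of relying on auto-detection
result = model.transcribe("video.mp4", language="zh")
print(result["text"])
```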
Model details

Whisper is a Transformer-based encoder-decoder model, also referred to as a sequence-to-sequence model. It was trained on 680k hours of labelled speech data annotated using large-scale weak supervision. The models were trained on either English-only data or multilingual data. The Engl...
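The English-only and multilingual variants are published as separate checkpoints. A hedged sketch of loading both families from the Hugging Face Hub (checkpoint IDs follow the openai/whisper-* naming convention):

```python
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# English-only checkpoint (".en" suffix)
en_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base.en")
en_processor = WhisperProcessor.from_pretrained("openai/whisper-base.en")

# Multilingual checkpoint (no suffix)
ml_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")
ml_processor = WhisperProcessor.from_pretrained("openai/whisper-base")
```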
```
whisper_model_load: model size    = 140.54 MB
whisper_init_state: kv self size  =   5.25 MB
whisper_init_state: kv cross size =  17.58 MB
whisper_full_with_state: auto-detected language: zh (p = 0.999260)
New Segment: 00:00:00 ==> 00:00:04.0800000 : 曾经我以为这一辈子我都不会再遇到喜欢...
```
(curl-s"$request_url2$file_name")echo"Server$request_url2$file_nameresponse:$response2"|tee-a"$log_file"returnfifile_size=$(stat-c%s"$file_path")model="small"# 记录开始时间start_time=$(date+%s)# 执行whisper命令whisper"$file_path"--model"$model"--output_formatjson--output_dir"$...
metric=evaluate.load("wer")eval_dataloader=DataLoader(common_voice["test"],batch_size=8,collate_fn=data_collator)model.eval()forstep,batchinenumerate(tqdm(eval_dataloader)):withtorch.cuda.amp.autocast():withtorch.no_grad():generated_tokens=(model.generate(input_features=batch["input_feature...
```python
from transformers import pipeline

pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=15,
    batch_size=4,
    generate_kwargs={"assistant_model": assistant_model},
    torch_dtype=...
```
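Passing `assistant_model` through `generate_kwargs` enables speculative decoding: the smaller assistant drafts tokens that the main model then verifies. Once built, the pipeline is invoked like any other ASR pipeline; a minimal usage sketch (the file name is a placeholder):

```python
result = pipe("sample.wav")
print(result["text"])
```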
(curl -s "$request_url2$file_name") echo "Server $request_url2$file_name response: $response2" | tee -a "$log_file" return fi file_size=$(stat -c%s "$file_path") model="small" # 记录开始时间 start_time=$(date +%s) # 执行whisper命令 whisper "$file_path" --model "$model...