| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139 |
- """
- 语音识别服务 (ASR)
- 功能: 将音频文件转为文字
- 支持多种ASR引擎
- """
- from flask import Flask, request, jsonify
- from flask_cors import CORS
- import tempfile
- import os
# Flask application object; CORS support is attached to it.
app = Flask(__name__)
CORS(app)

# ASR engine selection, read from the ASR_ENGINE environment variable.
# Options: 'faster-whisper', 'whisper', 'sherpa-onnx'
ASR_ENGINE = os.environ.get('ASR_ENGINE', 'faster-whisper')

# Module-level holder for the loaded model; assigned by load_model().
model = None
def load_model():
    """Load the ASR model selected by ``ASR_ENGINE`` into the global ``model``.

    ``'faster-whisper'`` and ``'whisper'`` each load their "base" model.
    Any other value leaves ``model`` as ``None`` and prints a notice that
    Sherpa-ONNX has to be deployed separately.
    """
    global model

    print(f'正在加载ASR模型 (引擎: {ASR_ENGINE})...')

    if ASR_ENGINE == 'whisper':
        import whisper

        model = whisper.load_model("base")
        print('Whisper模型加载成功')
        return

    if ASR_ENGINE != 'faster-whisper':
        print('使用Sherpa-ONNX需要单独部署,请参考文档')
        model = None
        return

    from faster_whisper import WhisperModel

    # A quantized (int8) model is used to reduce memory usage.
    whisper_options = {"device": "cpu", "compute_type": "int8"}
    model = WhisperModel("base", **whisper_options)
    print('Faster-Whisper模型加载成功')
def transcribe_with_faster_whisper(audio_path):
    """Recognise speech in ``audio_path`` with the loaded Faster-Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The text of all segments concatenated, with surrounding whitespace
        stripped.
    """
    segment_iter, _info = model.transcribe(
        audio_path,
        language="zh",
        beam_size=5,
        vad_filter=True,  # enable VAD to filter out silence
    )
    joined = "".join(seg.text for seg in segment_iter)
    return joined.strip()
def transcribe_with_whisper(audio_path):
    """Recognise speech in ``audio_path`` with the loaded OpenAI Whisper model.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The ``"text"`` entry of the transcription result, with surrounding
        whitespace stripped.
    """
    decode_options = {"language": "zh", "fp16": False}
    recognised = model.transcribe(audio_path, **decode_options)["text"]
    return recognised.strip()
@app.route('/transcribe', methods=['POST'])
def transcribe():
    """Transcribe an uploaded audio file to text.

    Request (multipart/form-data):
        audio: the audio file to recognise (wav, mp3, m4a).

    Responses:
        200: ``{"text": "<recognised text>"}``
        400: ``{"error": ...}`` when no audio file was uploaded, the
            configured engine is not supported, or no speech was recognised.
        500: ``{"error": ...}`` carrying the exception message for any other
            failure.
    """
    try:
        audio_file = request.files.get('audio')
        # An empty filename means the field was submitted without a file.
        if audio_file is None or not audio_file.filename:
            return jsonify({'error': '未找到音频文件'}), 400

        # Resolve the engine first so an unsupported configuration is
        # rejected before anything is written to disk.
        transcribers = {
            'faster-whisper': transcribe_with_faster_whisper,
            'whisper': transcribe_with_whisper,
        }
        run_engine = transcribers.get(ASR_ENGINE)
        if run_engine is None:
            return jsonify({'error': f'不支持的ASR引擎: {ASR_ENGINE}'}), 400

        # load_model() is otherwise only called from the ``__main__`` guard,
        # so when the app is imported by a WSGI server the model is still
        # unset; load it on first use instead of failing on ``None``.
        if model is None:
            load_model()

        # Keep the upload's own extension when it looks sane; fall back to
        # '.wav' otherwise.
        extension = os.path.splitext(audio_file.filename)[1]
        if not extension[1:].isalnum():
            extension = '.wav'

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=extension)
        audio_path = tmp.name
        try:
            # Write through the already-open handle, then close it before
            # the engine opens the file by path.
            with tmp:
                audio_file.save(tmp)

            text = run_engine(audio_path)
            if not text:
                return jsonify({'error': '未能识别到语音'}), 400

            print(f'识别结果: {text}')
            return jsonify({'text': text})
        finally:
            # Always remove the temporary file, including when saving the
            # upload or the recognition itself failed.
            try:
                os.remove(audio_path)
            except FileNotFoundError:
                pass
    except Exception as e:
        print(f'ASR错误: {e}')
        return jsonify({'error': str(e)}), 500
@app.route('/health', methods=['GET'])
def health():
    """Health check: report status, the configured engine and whether a model is loaded."""
    is_loaded = model is not None
    payload = dict(status='ok', engine=ASR_ENGINE, model_loaded=is_loaded)
    return jsonify(payload)
if __name__ == '__main__':
    # Load the ASR model before the server starts.
    load_model()

    for banner_line in ('ASR服务启动中...', '监听地址: http://0.0.0.0:5000'):
        print(banner_line)

    # Serve on all interfaces, port 5000, with Flask's debug mode off.
    app.run(debug=False, port=5000, host='0.0.0.0')
|