# WangKang / playwright_demo
"""
Speech recognition service (ASR)
Purpose: convert audio files to text
Supports multiple ASR engines
"""

from flask import Flask, request, jsonify
from flask_cors import CORS
import tempfile
import os

# Flask application object; CORS is applied to it with no extra options.
app = Flask(__name__)
CORS(app)

# ASR engine selection, taken from the ASR_ENGINE environment variable.
# Candidates: 'faster-whisper', 'whisper', 'sherpa-onnx'
ASR_ENGINE = os.environ.get('ASR_ENGINE', 'faster-whisper')

# Module-level slot for the loaded model; None until load_model() fills it.
model = None


def load_model():
    """
    Load the ASR model selected by ``ASR_ENGINE`` into the global ``model``.

    - 'faster-whisper': builds a ``WhisperModel("base")`` on CPU with int8
      compute type.
    - 'whisper': loads the "base" model through ``whisper.load_model``.
    - anything else: prints a hint about Sherpa-ONNX and leaves ``model``
      set to None.

    The engine packages are imported inside the matching branch, so only the
    selected engine's package has to be importable.
    """
    global model

    print(f'正在加载ASR模型 (引擎: {ASR_ENGINE})...')

    if ASR_ENGINE == 'whisper':
        import whisper
        model = whisper.load_model("base")
        print('Whisper模型加载成功')
        return

    if ASR_ENGINE != 'faster-whisper':
        # No in-process model for any other engine name.
        print('使用Sherpa-ONNX需要单独部署,请参考文档')
        model = None
        return

    from faster_whisper import WhisperModel
    # Quantized (int8) model to reduce memory usage.
    model = WhisperModel("base", device="cpu", compute_type="int8")
    print('Faster-Whisper模型加载成功')


def transcribe_with_faster_whisper(audio_path):
    """
    Run speech recognition on ``audio_path`` with the global Faster-Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.

    Returns:
        The ``text`` of every returned segment joined without a separator,
        with leading and trailing whitespace stripped.
    """
    decode_options = {
        "language": "zh",
        "beam_size": 5,
        "vad_filter": True,  # enable VAD to filter out silence
    }
    segments, _info = model.transcribe(audio_path, **decode_options)

    pieces = (seg.text for seg in segments)
    return "".join(pieces).strip()


def transcribe_with_whisper(audio_path):
    """
    Run speech recognition on ``audio_path`` with the global OpenAI Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.

    Returns:
        The ``"text"`` entry of the transcription result, stripped of
        leading and trailing whitespace.
    """
    outcome = model.transcribe(audio_path, fp16=False, language="zh")
    recognized = outcome["text"]
    return recognized.strip()


@app.route('/transcribe', methods=['POST'])
def transcribe():
    """
    Endpoint: speech to text.

    Request (multipart form):
        - audio: audio file (wav, mp3, m4a)

    Response (200):
        {
            "text": "recognized text"
        }

    Error responses have the form {"error": "..."}:
        - 400: no usable ``audio`` upload, ``ASR_ENGINE`` is not one of the
          engines handled here, or no speech was recognized
        - 503: the engine is handled here but the global ``model`` is still
          None (``load_model()`` has not populated it)
        - 500: any other exception raised while handling the request; the
          exception text is returned in the ``error`` field
    """
    try:
        audio_file = request.files.get('audio')
        # A file field submitted without a chosen file arrives as an entry
        # with an empty filename; treat it the same as a missing upload.
        if audio_file is None or not audio_file.filename:
            return jsonify({'error': '未找到音频文件'}), 400

        # Resolve the engine before touching the disk so an unsupported
        # configuration never creates a temporary file.
        engines = {
            'faster-whisper': transcribe_with_faster_whisper,
            'whisper': transcribe_with_whisper,
        }
        recognize = engines.get(ASR_ENGINE)
        if recognize is None:
            return jsonify({'error': f'不支持的ASR引擎: {ASR_ENGINE}'}), 400

        # Without this check the helpers would fail with an opaque
        # AttributeError on None and the client would get a generic 500.
        if model is None:
            return jsonify({'error': 'ASR模型未加载'}), 503

        # Reserve a temp path and close the handle right away; the upload is
        # then written by path inside the try/finally below, so the file is
        # removed even when saving itself fails.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            audio_path = tmp.name

        try:
            audio_file.save(audio_path)

            text = recognize(audio_path)
            if not text:
                return jsonify({'error': '未能识别到语音'}), 400

            print(f'识别结果: {text}')
            return jsonify({'text': text})

        finally:
            # Always delete the temporary file.
            try:
                os.remove(audio_path)
            except FileNotFoundError:
                pass

    except Exception as e:
        # Top-level boundary of the endpoint: log and report as a 500.
        print(f'ASR错误: {e}')
        return jsonify({'error': str(e)}), 500


@app.route('/health', methods=['GET'])
def health():
    """
    Health check.

    Returns a JSON object with a fixed ``status`` of 'ok', the configured
    ``ASR_ENGINE`` and a ``model_loaded`` flag that is true when the global
    ``model`` is not None.
    """
    model_ready = model is not None
    payload = {
        'status': 'ok',
        'engine': ASR_ENGINE,
        'model_loaded': model_ready,
    }
    return jsonify(payload)


if __name__ == '__main__':
    # Load the model first so it is in place before the server starts.
    load_model()
    for banner in ('ASR服务启动中...', '监听地址: http://0.0.0.0:5000'):
        print(banner)
    # Start Flask's built-in server on all interfaces, port 5000.
    app.run(debug=False, port=5000, host='0.0.0.0')