feat: api

2026-02-10 17:56:37 +08:00
parent d9ba359ddf
commit 7879751126
8 changed files with 550 additions and 2 deletions
--- a/.github/workflows/ci-cd.yml
+++ b/.github/workflows/ci-cd.yml
@ -0,0 +1,33 @@
 # Builds the container image from ./Dockerfile and pushes it to the registry configured in env.
 name: Build container
 env:
  # Image tag. It is a fixed value, so every run pushes to this same tag.
  VERSION: 0.0.1
  # Registry address handed to the login step.
  REGISTRY: https://harbor.bwgdi.com
  # Registry host, project path and image name; together they form the pushed image reference.
  REGISTRY_NAME: harbor.bwgdi.com
  REGISTRY_PATH: library
  DOCKER_NAME: fun-asr
 on:
  # Triggered by pushes to main and by manual dispatch.
  push:
    branches:
      - main
  workflow_dispatch:
 jobs:
  build-docker:
    runs-on: builder-ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v3
      # The login targets the registry in env.REGISTRY (not Docker Hub).
      - name: Login to container registry
        uses: docker/login-action@v3
        with:
          registry: ${{ env.REGISTRY }}
          username: ${{ secrets.BWGDI_NAME }}
          password: ${{ secrets.BWGDI_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v2
      - name: Build and push
        uses: docker/build-push-action@v4
        with:
          context: .
          file: ./Dockerfile
          push: true
          # Resulting reference: <REGISTRY_NAME>/<REGISTRY_PATH>/<DOCKER_NAME>:<VERSION>
          tags: ${{ env.REGISTRY_NAME }}/${{ env.REGISTRY_PATH }}/${{ env.DOCKER_NAME }}:${{ env.VERSION }}
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,38 @@
 # Python-generated files
 __pycache__/
 *.py[cod]
 *$py.class
 # Distribution / packaging
 build/
 dist/
 wheels/
 *.egg-info/
 # Unit test / coverage reports
 .pytest_cache/
 .coverage
 htmlcov/
 coverage.xml
 # Logs
 *.log
 log/*.log
 # Virtual environments
 .venv/
 venv/
 env/
 # IDE settings
 .vscode/
 .idea/
 # OS generated files
 .DS_Store
 # Generated files
 *.wav
 *.pdf
 *.lock
 # Exception: uv.lock must stay under version control because the Docker build copies it into the image
 !uv.lock
--- a/Dockerfile
+++ b/Dockerfile
@ -0,0 +1,21 @@
 FROM python:3.12-slim
 # Install ffmpeg and drop the apt package lists in the same layer
 RUN apt-get update && apt-get -y install \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
 # Take the uv and uvx binaries from the published uv image
 COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
 # Create app directory
 WORKDIR /app
 # Copy dependency definition files.
 # The trailing glob makes uv.lock optional, so the build does not fail when the
 # lockfile is not present in the build context (e.g. it has not been committed).
 COPY pyproject.toml uv.lock* ./
 # Install dependencies (long HTTP timeout for large package downloads)
 ENV UV_HTTP_TIMEOUT=1200
 RUN uv sync
 # Copy the rest of the application
 COPY . .
 EXPOSE 5000
 CMD [ "uv", "run", "api.py" ]
--- a/api.py
+++ b/api.py
@ -0,0 +1,133 @@
 import os
 import shutil
 import uuid
 import torch
 import numpy as np
 import soundfile as sf
 from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 # Import the libraries required by the two inference modes
 from funasr import AutoModel
 from model import FunASRNano
 from tools.utils import load_audio
 app = FastAPI(title="FunASR Dual-Mode API")
 # --- Environment configuration ---
 # Compute device, in priority order: CUDA, then MPS, then CPU.
 if torch.cuda.is_available():
    device = "cuda:0"
 elif torch.backends.mps.is_available():
    device = "mps"
 else:
    device = "cpu"
 # Model location; overridable through the MODEL_DIR environment variable.
 MODEL_DIR = os.environ.get("MODEL_DIR", "/models/Fun-ASR-Nano-2512")
 # Directory that receives uploaded audio files; created at import time if missing.
 TEMP_DIR = "./temp_audio"
 os.makedirs(TEMP_DIR, exist_ok=True)
 # --- Global model initialisation (executed once, when the module is imported) ---
 # Mode 1: the funasr AutoModel wrapper, configured with the "fsmn-vad" VAD model.
 print("正在加载 AutoModel (Mode 1)...")
 model_auto = AutoModel(
    model=MODEL_DIR,
    trust_remote_code=True,
    vad_model="fsmn-vad",
    vad_kwargs={"max_single_segment_time": 30000},
    device=device,
    hub="ms",
 )
 # Mode 2: the FunASRNano model loaded directly; from_pretrained also returns
 # the keyword arguments that are later forwarded to model_direct.inference.
 print("正在加载 Direct Model (Mode 2)...")
 model_direct, direct_kwargs = FunASRNano.from_pretrained(
    model=MODEL_DIR,
    device=device,
 )
 # The tokenizer is None when the returned kwargs do not carry one.
 tokenizer = direct_kwargs.get("tokenizer")
 model_direct.eval()
 # --- Endpoint 1: Using FunASR for Inference ---
@app.post("/inference/funasr")
 async def inference_funasr(
    file: UploadFile = File(...),
    language: str = Form("中文"),
    itn: str = Form("true"),
    hotwords: str = Form("")
 ):
    # Transcribe an uploaded audio file through the shared `model_auto` instance.
    #
    # file:     uploaded audio; stored in a temporary file for the duration of the request.
    # language: forwarded to the model after trimming whitespace and surrounding double quotes.
    # itn:      treated as enabled when it equals "true", "1" or "t" (case-insensitive).
    # hotwords: forwarded after the same trimming; an empty value is sent as None.
    #
    # Responds with {"status": "success", "text": ...}; any failure is printed with
    # its traceback and reported as an HTTP 500 carrying the error message.
    temp_path = save_temp_file(file)
    try:
        itn_enabled = itn.lower() in ("true", "1", "t")
        lang_value = language.strip().strip('"')
        hotword_value = hotwords.strip().strip('"')
        # No cache argument is passed to generate().
        results = model_auto.generate(
            input=temp_path,
            batch_size=1,
            hotwords=hotword_value or None,
            language=lang_value,
            itn=itn_enabled,
        )
        first_result = results[0]
        return {"status": "success", "text": first_result["text"]}
    except Exception as exc:
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(exc))
    finally:
        # The temporary upload is deleted whether or not inference succeeded.
        remove_temp_file(temp_path)
 # --- Endpoint 2: Direct Inference ---
@app.post("/inference/direct")
 async def inference_direct(
    file: UploadFile = File(...),
    chunk_mode: bool = Form(False)  # whether to enable the chunked loop taken from script 2
 ):
    """Run inference by calling the FunASRNano model from model.py directly.

    Args:
        file: Uploaded audio file; stored in a temporary file for the
            duration of the request.
        chunk_mode: When false, the whole file is passed to
            ``model_direct.inference`` in a single call. When true, inference
            is re-run on growing prefixes of the audio (0.72 s steps) and the
            text of each step is fed into the next one as ``prev_text``.

    Returns:
        A dict with ``status``, ``mode`` and ``text``. In standard mode
        ``text`` is the first result item exactly as the model returned it;
        in chunk mode it is the text string of the last step.

    Raises:
        HTTPException: Status 500 carrying the error message when any step fails.
    """
    temp_path = save_temp_file(file)
    try:
        if not chunk_mode:
            # Mode A: standard single-pass inference on the whole file.
            res = model_direct.inference(data_in=[temp_path], **direct_kwargs)
            text = res[0][0]
        else:
            # Mode B: reproduce the chunked loop from script 2.
            duration = sf.info(temp_path).duration
            chunk_size = 0.72
            # Cumulative end times: chunk_size, 2 * chunk_size, ... covering the full duration.
            cum_durations = np.arange(chunk_size, duration + chunk_size, chunk_size)
            last_idx = len(cum_durations) - 1
            prev_text = ""
            for idx, cum_duration in enumerate(cum_durations):
                # Each step reloads the audio from the start up to cum_duration seconds.
                audio, _ = load_audio(temp_path, 16000, duration=round(cum_duration, 3))
                # Note: this calls the model's own inference logic.
                step_res = model_direct.inference(
                    [torch.tensor(audio).to(device)],
                    prev_text=prev_text,
                    **direct_kwargs
                )
                prev_text = step_res[0][0]["text"]
                # Special decoding step from script 2: on every step except the
                # last, drop the final 5 tokens before feeding the text back.
                if idx != last_idx and tokenizer:
                    truncated_ids = tokenizer.encode(prev_text)[:-5]
                    # The original replace("", "") was a no-op. Presumably the
                    # intended target is U+FFFD, which decode() can emit when
                    # the cut lands inside a multi-byte character -- confirm
                    # against script 2.
                    prev_text = tokenizer.decode(truncated_ids).replace("\ufffd", "")
            text = prev_text
        return {"status": "success", "mode": "direct", "text": text}
    except Exception as e:
        # Print the traceback like the /inference/funasr endpoint does, so the
        # cause is visible in the server output and not only in the response.
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=str(e))
    finally:
        # The temporary upload is deleted whether or not inference succeeded.
        remove_temp_file(temp_path)
 # --- Utility functions ---
 def save_temp_file(upload_file):
    """Write an uploaded file into TEMP_DIR and return the path of the copy.

    Args:
        upload_file: Object exposing ``filename`` (str or None) and ``file``
            (a readable binary file object), such as a FastAPI ``UploadFile``.

    Returns:
        Path of the new file. Its name is a random UUID followed by the
        extension of the uploaded file name, if it has one.

    Raises:
        OSError: If the file cannot be created or written. A partially
            written file is removed before the error propagates.
    """
    # ``filename`` may be None or empty; fall back to "no extension" instead of
    # letting os.path.splitext raise TypeError.
    ext = os.path.splitext(upload_file.filename or "")[1]
    path = os.path.join(TEMP_DIR, f"{uuid.uuid4()}{ext}")
    try:
        with open(path, "wb") as buffer:
            shutil.copyfileobj(upload_file.file, buffer)
    except Exception:
        # The endpoints call this function before entering their try/finally,
        # so a failed copy has to clean up after itself.
        if os.path.exists(path):
            os.remove(path)
        raise
    return path
 def remove_temp_file(path):
    """Delete the file at ``path``; a path that does not exist is ignored.

    Args:
        path: Path of the file to delete.

    Raises:
        OSError: For failures other than the file being absent.
    """
    # Attempt the removal directly rather than checking first: the file can
    # disappear between an existence check and the removal.
    try:
        os.remove(path)
    except FileNotFoundError:
        pass
 if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 5000.
    from uvicorn import run as run_server
    run_server(app, host="0.0.0.0", port=5000)
--- a/pyproject.toml
+++ b/pyproject.toml
@ -0,0 +1,27 @@
 [project]
 name = "Fun-ASR"
 version = "0.1.0"
 description = "语音识别/处理相关项目"
 readme = "README.md"
 requires-python = ">=3.12,<3.13"
 dependencies = [
    "torch>=2.9.0",
    "torchaudio>=2.9.0",
    "transformers>=4.51.3",
    "funasr>=1.3.0",
    "zhconv",
    "whisper_normalizer",
    "pyopenjtalk-plus",
    "compute-wer",
    "openai-whisper",
    "python-multipart==0.0.20",
    "fastapi>=0.128.0",
    "uvicorn>=0.40.0",
 ]
 [tool.uv]
 package = false # Declares that this project is an application, not a library
 [build-system]
 requires = ["hatchling"]
 build-backend = "hatchling.build"
--- a/readme_bw.md
+++ b/readme_bw.md
@ -0,0 +1,147 @@
 # FunASR Dual-Mode API
 This is a speech recognition (ASR) service built on FastAPI, integrating two inference modes of FunASR to provide flexible speech transcription capabilities.
 ## Features
 The service provides two main inference interfaces:
 1.  **AutoModel Mode (`/inference/funasr`)**:
    *   Uses the `funasr.AutoModel` high-level interface.
    *   Integrates VAD (Voice Activity Detection).
    *   Supports Hotwords enhancement.
    *   Supports ITN (Inverse Text Normalization).
    *   Supports multi-language configuration.
 2.  **Direct Model Mode (`/inference/direct`)**:
    *   Directly calls the underlying `FunASRNano` model.
    *   Supports standard full inference.
    *   Supports simulated streaming/chunk inference (Chunk Mode) for testing the model's incremental decoding capabilities.
 ## Environment Setup
 ### Dependency Installation
 This project uses `uv` for dependency management. Please ensure `uv` is installed, then run the following command in the project root directory:
 ```bash
 uv sync
 ```
 ### Model Configuration
 The default model path is configured as `/models/Fun-ASR-Nano-2512`. If your model is located elsewhere, please set the environment variable `MODEL_DIR`:
 ```bash
 export MODEL_DIR="/your/absolute/path/to/model"
 ```
 ## Start Service
 You can start the service directly using the uv script (default port 5000):
 ```bash
 uv run api.py
 ```
 The service will automatically detect the computing device (CUDA > MPS > CPU) upon startup.
 ### Docker Startup
 If deploying with Docker, you can refer to the following command. You can specify a custom model path using `-e MODEL_DIR`:
 ```bash
 docker run -d --restart always -p 5000:5000 --gpus "device=1" \
  -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \
  --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \
  harbor.bwgdi.com/library/fun-asr:0.0.1
 ```
 ## API Documentation
 ### 1. FunASR Standard Inference Interface
 *   **URL**: `/inference/funasr`
 *   **Method**: `POST`
 *   **Content-Type**: `multipart/form-data`
 | Parameter Name | Type | Required | Default | Description |
 | :--- | :--- | :--- | :--- | :--- |
 | `file` | File | Yes | - | Audio file |
 | `language` | String | No | "中文" | Target language |
 | `itn` | String | No | "true" | Whether to enable Inverse Text Normalization (true/false) |
 | `hotwords` | String | No | "" | List of hotwords to improve recognition rate of specific vocabulary |
 **Example**:
 ```bash
 curl -X POST "http://127.0.0.1:5000/inference/funasr" \
  -F "file=@/path/to/audio.wav" \
  -F "hotwords=开放时间"
 ```
 ### 2. Direct Underlying Inference Interface
 *   **URL**: `/inference/direct`
 *   **Method**: `POST`
 *   **Content-Type**: `multipart/form-data`
 | Parameter Name | Type | Required | Default | Description |
 | :--- | :--- | :--- | :--- | :--- |
 | `file` | File | Yes | - | Audio file |
 | `chunk_mode` | Boolean | No | False | Whether to enable chunk simulation mode (true/false) |
 **Example**:
 ```bash
 # Enable chunk simulation mode
 curl -X POST "http://127.0.0.1:5000/inference/direct" \
  -F "file=@/path/to/audio.wav" \
  -F "chunk_mode=true"
 ```
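 **Response** (standard mode, `chunk_mode=false`; with `chunk_mode=true` the `text` field is a plain string instead of an object):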
 ```json
 {
    "status": "success",
    "mode": "direct",
    "text": {
        "key": "rand_key_WgNZq6ITZM5jt",
        "text": "你好。",
        "text_tn": "你好",
        "label": "null",
        "ctc_text": "你好",
        "ctc_timestamps": [
            {
                "token": "你",
                "start_time": 1.8,
                "end_time": 1.86,
                "score": 0.908
            },
            {
                "token": "好",
                "start_time": 2.16,
                "end_time": 2.22,
                "score": 0.988
            }
        ],
        "timestamps": [
            {
                "token": "你",
                "start_time": 1.8,
                "end_time": 1.86,
                "score": 0.908
            },
            {
                "token": "好",
                "start_time": 2.16,
                "end_time": 2.22,
                "score": 0.988
            },
            {
                "token": "。",
                "start_time": 2.88,
                "end_time": 2.94,
                "score": 0.0
            }
        ]
    }
 }
 ```
--- a/readme_bw_zh.md
+++ b/readme_bw_zh.md
@ -0,0 +1,147 @@
 # FunASR Dual-Mode API
 这是一个基于 FastAPI 构建的语音识别（ASR）服务，集成了 FunASR 的两种推理模式，旨在提供灵活的语音转写能力。
 ## 功能特性
 服务提供了两个主要的推理接口：
 1.  **AutoModel 模式 (`/inference/funasr`)**:
    *   使用 `funasr.AutoModel` 高级接口。
    *   集成 VAD（语音活动检测）。
    *   支持热词（Hotwords）增强。
    *   支持 ITN（逆文本标准化）。
    *   支持多语言配置。
 2.  **Direct Model 模式 (`/inference/direct`)**:
    *   直接调用底层 `FunASRNano` 模型。
    *   支持普通全量推理。
    *   支持模拟流式/分片推理（Chunk Mode），用于测试模型的增量解码能力。
 ## 环境准备
 ### 依赖安装
 本项目使用 `uv` 进行依赖管理。请确保已安装 `uv`，然后在项目根目录下运行：
 ```bash
 uv sync
 ```
 ### 模型配置
 默认模型路径配置为 `/models/Fun-ASR-Nano-2512`。如果你的模型在其他位置，请设置环境变量 `MODEL_DIR`：
 ```bash
 export MODEL_DIR="/你的/模型/绝对路径"
 ```
 ## 启动服务
 可以直接运行 uv 脚本启动（默认端口 5000）：
 ```bash
 uv run api.py
 ```
 服务启动时会自动检测计算设备（CUDA > MPS > CPU）。
 ### Docker 启动
 若使用 Docker 部署，可参考以下命令。如需自定义模型路径，可通过 `-e MODEL_DIR` 指定：
 ```bash
 docker run -d --restart always -p 5000:5000 --gpus "device=1" \
  -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \
  --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \
  harbor.bwgdi.com/library/fun-asr:0.0.1
 ```
 ## 接口文档
 ### 1. FunASR 标准推理接口
 *   **URL**: `/inference/funasr`
 *   **Method**: `POST`
 *   **Content-Type**: `multipart/form-data`
 | 参数名 | 类型 | 必填 | 默认值 | 说明 |
 | :--- | :--- | :--- | :--- | :--- |
 | `file` | File | 是 | - | 音频文件 |
 | `language` | String | 否 | "中文" | 目标语言 |
 | `itn` | String | 否 | "true" | 是否开启逆文本标准化 (true/false) |
 | `hotwords` | String | 否 | "" | 热词列表，用于提升特定词汇识别率 |
 **示例**:
 ```bash
 curl -X POST "http://127.0.0.1:5000/inference/funasr" \
  -F "file=@/path/to/audio.wav" \
  -F "hotwords=开放时间"
 ```
 ### 2. Direct 底层推理接口
 *   **URL**: `/inference/direct`
 *   **Method**: `POST`
 *   **Content-Type**: `multipart/form-data`
 | 参数名 | 类型 | 必填 | 默认值 | 说明 |
 | :--- | :--- | :--- | :--- | :--- |
 | `file` | File | 是 | - | 音频文件 |
 | `chunk_mode` | Boolean | 否 | False | 是否开启分片模拟模式 (true/false) |
 **示例**:
 ```bash
 # 开启分片模拟模式
 curl -X POST "http://127.0.0.1:5000/inference/direct" \
  -F "file=@/path/to/audio.wav" \
  -F "chunk_mode=true"
 ```
 **返回**（标准模式，`chunk_mode=false`；当 `chunk_mode=true` 时，`text` 字段为纯字符串而不是对象）:
 ```json
 {
    "status": "success",
    "mode": "direct",
    "text": {
        "key": "rand_key_WgNZq6ITZM5jt",
        "text": "你好。",
        "text_tn": "你好",
        "label": "null",
        "ctc_text": "你好",
        "ctc_timestamps": [
            {
                "token": "你",
                "start_time": 1.8,
                "end_time": 1.86,
                "score": 0.908
            },
            {
                "token": "好",
                "start_time": 2.16,
                "end_time": 2.22,
                "score": 0.988
            }
        ],
        "timestamps": [
            {
                "token": "你",
                "start_time": 1.8,
                "end_time": 1.86,
                "score": 0.908
            },
            {
                "token": "好",
                "start_time": 2.16,
                "end_time": 2.22,
                "score": 0.988
            },
            {
                "token": "。",
                "start_time": 2.88,
                "end_time": 2.94,
                "score": 0.0
            }
        ]
    }
 }
 ```
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,4 @@
-torch>=2.9.0
+torchaudio
 torchaudio>=2.9.0
 transformers>=4.51.3
 funasr>=1.3.0
 zhconv
@ -7,3 +6,6 @@ whisper_normalizer
 pyopenjtalk-plus
 compute-wer
 openai-whisper
 fastapi
 uvicorn
 python-multipart==0.0.20