From 787975112603b0d4a152fd396cd4b12e5d04d0ee Mon Sep 17 00:00:00 2001
From: vera <511201264@qq.com>
Date: Tue, 10 Feb 2026 17:56:37 +0800
Subject: [PATCH] feat: api

---
 .github/workflows/ci-cd.yml |  33 ++++++++
 .gitignore                  |  38 ++++++++++
 Dockerfile                  |  21 ++++++
 api.py                      | 133 ++++++++++++++++++++++++++++++++
 pyproject.toml              |  27 +++++++
 readme_bw.md                | 147 ++++++++++++++++++++++++++++++++++++
 readme_bw_zh.md             | 147 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   6 +-
 8 files changed, 550 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/ci-cd.yml
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 api.py
 create mode 100644 pyproject.toml
 create mode 100644 readme_bw.md
 create mode 100644 readme_bw_zh.md

diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
new file mode 100644
index 0000000..ceb2df5
--- /dev/null
+++ b/.github/workflows/ci-cd.yml
@@ -0,0 +1,33 @@
+name: Build container
+env:
+  VERSION: 0.0.1
+  REGISTRY: https://harbor.bwgdi.com
+  REGISTRY_NAME: harbor.bwgdi.com
+  REGISTRY_PATH: library
+  DOCKER_NAME: fun-asr
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:
+jobs:
+  build-docker:
+    runs-on: builder-ubuntu-latest
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Login to Harbor registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.BWGDI_NAME }}
+          password: ${{ secrets.BWGDI_TOKEN }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Build and push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ env.REGISTRY_NAME }}/${{ env.REGISTRY_PATH }}/${{ env.DOCKER_NAME }}:${{ env.VERSION }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d423de9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,38 @@
+# Python-generated files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+build/
+dist/
+wheels/
+*.egg-info/
+
+# Unit test / coverage reports
+.pytest_cache/
+.coverage
+htmlcov/
+coverage.xml
+
+# Logs
+*.log
+log/*.log
+
+# Virtual environments
+.venv/
+venv/
+env/
+
+# IDE settings
+.vscode/
+.idea/
+
+# OS generated files
+.DS_Store
+
+# Generated files
+*.wav
+*.pdf
+
+*.lock
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5f91836
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.12-slim
+RUN apt-get update && apt-get -y install \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Create app directory
+WORKDIR /app
+# Copy dependency definition files (uv.lock is optional: .gitignore excludes *.lock)
+COPY pyproject.toml uv.lock* ./
+
+# Install dependencies
+ENV UV_HTTP_TIMEOUT=1200
+RUN uv sync
+
+# Copy the rest of the application
+COPY . .
+
+EXPOSE 5000
+CMD [ "uv", "run", "api.py" ]
\ No newline at end of file
diff --git a/api.py b/api.py
new file mode 100644
index 0000000..7fd1a19
--- /dev/null
+++ b/api.py
@@ -0,0 +1,222 @@
+"""FastAPI service exposing two FunASR inference modes.
+
+``POST /inference/funasr`` runs ``funasr.AutoModel`` (configured with the
+``fsmn-vad`` VAD model); ``POST /inference/direct`` calls ``FunASRNano``
+directly, optionally re-decoding growing prefixes of the audio to simulate
+streaming.  Both models are loaded once at import time.  Each upload is
+written to ``TEMP_DIR`` for the duration of the request and then removed.
+"""
+import contextlib
+import logging
+import os
+import shutil
+import threading
+import uuid
+
+import numpy as np
+import soundfile as sf
+import torch
+from fastapi import FastAPI, File, Form, HTTPException, UploadFile
+from fastapi.concurrency import run_in_threadpool
+
+# Libraries needed by the two inference modes.
+from funasr import AutoModel
+from model import FunASRNano
+from tools.utils import load_audio
+
+logger = logging.getLogger(__name__)
+
+app = FastAPI(title="FunASR Dual-Mode API")
+
+# --- Environment configuration ---
+device = (
+    "cuda:0" if torch.cuda.is_available()
+    else "mps" if torch.backends.mps.is_available()
+    else "cpu"
+)
+MODEL_DIR = os.getenv("MODEL_DIR", "/models/Fun-ASR-Nano-2512")
+TEMP_DIR = "./temp_audio"
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# Form values (compared case-insensitively) that switch ITN on.
+_TRUE_VALUES = frozenset({"true", "1", "t"})
+
+# Chunk-mode parameters: seconds of audio added per step, the sample rate
+# passed to load_audio, and how many trailing tokens of each intermediate
+# hypothesis are dropped before it is fed back as ``prev_text``.
+CHUNK_SECONDS = 0.72
+CHUNK_SAMPLE_RATE = 16000
+CHUNK_TRIM_TOKENS = 5
+
+# The endpoints used to run inference inline in ``async def`` handlers, so
+# only one inference ran at a time.  Inference now runs in worker threads to
+# keep the event loop responsive; this lock preserves the one-at-a-time
+# behaviour because the models are not known to be thread-safe.
+_inference_lock = threading.Lock()
+
+# --- Global model initialisation ---
+print("正在加载 AutoModel (Mode 1)...")
+model_auto = AutoModel(
+    model=MODEL_DIR,
+    trust_remote_code=True,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device=device,
+    hub="ms",
+)
+
+print("正在加载 Direct Model (Mode 2)...")
+model_direct, direct_kwargs = FunASRNano.from_pretrained(model=MODEL_DIR, device=device)
+tokenizer = direct_kwargs.get("tokenizer", None)
+model_direct.eval()
+
+
+# --- Helpers ---
+def save_temp_file(upload_file):
+    """Copy an uploaded file into ``TEMP_DIR`` and return the new path.
+
+    The file gets a random UUID name; only the extension of the client's
+    filename is kept.  A missing filename yields no extension.  If the copy
+    fails, the partial file is removed before the error propagates.
+    """
+    ext = os.path.splitext(upload_file.filename or "")[1]
+    path = os.path.join(TEMP_DIR, f"{uuid.uuid4()}{ext}")
+    try:
+        with open(path, "wb") as buffer:
+            shutil.copyfileobj(upload_file.file, buffer)
+    except Exception:
+        remove_temp_file(path)
+        raise
+    return path
+
+
+def remove_temp_file(path):
+    """Delete ``path``; a file that is already gone is not an error."""
+    with contextlib.suppress(FileNotFoundError):
+        os.remove(path)
+
+
+def _transcribe_auto(path, language, itn, hotwords):
+    """Transcribe ``path`` with ``model_auto`` and return the recognised text.
+
+    ``language`` and ``hotwords`` are stripped of surrounding whitespace and
+    double quotes; an empty hotword string is passed as ``None``.
+    """
+    is_itn = itn.strip().lower() in _TRUE_VALUES
+    clean_lang = language.strip().strip('"')
+    clean_hw = hotwords.strip().strip('"')
+
+    # Key point: no cache is passed, and hotwords are only sent when non-empty.
+    with _inference_lock:
+        res = model_auto.generate(
+            input=path,
+            batch_size=1,
+            hotwords=clean_hw or None,
+            language=clean_lang,
+            itn=is_itn,
+        )
+    return res[0]["text"]
+
+
+def _transcribe_direct(path):
+    """Run one full-file ``FunASRNano`` inference and return its first result.
+
+    The value is ``res[0][0]`` unchanged, i.e. the whole result entry rather
+    than only its ``"text"`` field.
+    """
+    with _inference_lock:
+        res = model_direct.inference(data_in=[path], **direct_kwargs)
+    return res[0][0]
+
+
+def _transcribe_chunked(path):
+    """Simulate streaming by re-decoding ever longer prefixes of ``path``.
+
+    Every step decodes the first ``k * CHUNK_SECONDS`` seconds of audio and
+    passes the previous hypothesis as ``prev_text``.  Returns the text of the
+    final step, or ``""`` when the audio has zero duration.
+    """
+    duration = sf.info(path).duration
+    cum_durations = np.arange(CHUNK_SECONDS, duration + CHUNK_SECONDS, CHUNK_SECONDS)
+    last_idx = len(cum_durations) - 1
+    prev_text = ""
+
+    for idx, cum_duration in enumerate(cum_durations):
+        audio, _ = load_audio(path, CHUNK_SAMPLE_RATE, duration=round(cum_duration, 3))
+        with _inference_lock:
+            step_res = model_direct.inference(
+                [torch.tensor(audio).to(device)],
+                prev_text=prev_text,
+                **direct_kwargs,
+            )
+        prev_text = step_res[0][0]["text"]
+
+        # On every step but the last, drop the trailing tokens of the
+        # hypothesis before feeding it back as prev_text.
+        # NOTE(review): the original call was `.replace("", "")`, a no-op.
+        # U+FFFD is assumed to be the intended character (truncating the
+        # token list can presumably leave a partial character) - confirm.
+        if idx != last_idx and tokenizer:
+            kept = tokenizer.encode(prev_text)[:-CHUNK_TRIM_TOKENS]
+            prev_text = tokenizer.decode(kept).replace("\ufffd", "")
+
+    return prev_text
+
+
+# --- Endpoint 1: inference through funasr.AutoModel ---
+@app.post("/inference/funasr")
+async def inference_funasr(
+    file: UploadFile = File(...),
+    language: str = Form("中文"),
+    itn: str = Form("true"),
+    hotwords: str = Form(""),
+):
+    """Transcribe an uploaded audio file with ``funasr.AutoModel``.
+
+    Form fields: ``file`` (audio), ``language``, ``itn`` (enabled for
+    "true"/"1"/"t", case-insensitive) and ``hotwords``.  Responds with
+    ``{"status": "success", "text": ...}``; any failure during inference
+    becomes an HTTP 500 whose detail is the exception message.
+    """
+    temp_path = await run_in_threadpool(save_temp_file, file)
+    try:
+        text = await run_in_threadpool(
+            _transcribe_auto, temp_path, language, itn, hotwords
+        )
+        return {"status": "success", "text": text}
+    except Exception as e:
+        logger.exception("AutoModel inference failed")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        remove_temp_file(temp_path)
+
+
+# --- Endpoint 2: direct inference ---
+@app.post("/inference/direct")
+async def inference_direct(
+    file: UploadFile = File(...),
+    chunk_mode: bool = Form(False),  # enable the simulated chunked decoding
+):
+    """Run inference by calling ``FunASRNano`` from model.py directly.
+
+    With ``chunk_mode`` off, ``text`` in the response is the model's first
+    result entry as returned; with it on, ``text`` is the string produced by
+    the chunked loop.  Failures become an HTTP 500 whose detail is the
+    exception message.
+    """
+    temp_path = await run_in_threadpool(save_temp_file, file)
+    try:
+        transcribe = _transcribe_chunked if chunk_mode else _transcribe_direct
+        text = await run_in_threadpool(transcribe, temp_path)
+        return {"status": "success", "mode": "direct", "text": text}
+    except Exception as e:
+        logger.exception("Direct inference failed")
+        raise HTTPException(status_code=500, detail=str(e)) from e
+    finally:
+        remove_temp_file(temp_path)
+
+
+if __name__ == "__main__":
+    import uvicorn
+
+    uvicorn.run(app, host="0.0.0.0", port=5000)
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2f0b1f9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "Fun-ASR"
+version = "0.1.0"
+description = "语音识别/处理相关项目"
+readme = "README.md"
+requires-python = ">=3.12,<3.13"
+dependencies = [ + "torch>=2.9.0", + "torchaudio>=2.9.0", + "transformers>=4.51.3", + "funasr>=1.3.0", + "zhconv", + "whisper_normalizer", + "pyopenjtalk-plus", + "compute-wer", + "openai-whisper", + "python-multipart==0.0.20", + "fastapi>=0.128.0", + "uvicorn>=0.40.0", +] + +[tool.uv] +package = false # 声明这只是一个应用程序,不是一个库 + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" diff --git a/readme_bw.md b/readme_bw.md new file mode 100644 index 0000000..408c56f --- /dev/null +++ b/readme_bw.md @@ -0,0 +1,147 @@ +# FunASR Dual-Mode API + +This is a speech recognition (ASR) service built on FastAPI, integrating two inference modes of FunASR to provide flexible speech transcription capabilities. + +## Features + +The service provides two main inference interfaces: + +1. **AutoModel Mode (`/inference/funasr`)**: + * Uses the `funasr.AutoModel` high-level interface. + * Integrates VAD (Voice Activity Detection). + * Supports Hotwords enhancement. + * Supports ITN (Inverse Text Normalization). + * Supports multi-language configuration. + +2. **Direct Model Mode (`/inference/direct`)**: + * Directly calls the underlying `FunASRNano` model. + * Supports standard full inference. + * Supports simulated streaming/chunk inference (Chunk Mode) for testing the model's incremental decoding capabilities. + +## Environment Setup + +### Dependency Installation + +This project uses `uv` for dependency management. Please ensure `uv` is installed, then run the following command in the project root directory: + +```bash +uv sync +``` + +### Model Configuration + +The default model path is configured as `/models/Fun-ASR-Nano-2512`. 
If your model is located elsewhere, please set the environment variable `MODEL_DIR`: + +```bash +export MODEL_DIR="/your/absolute/path/to/model" +``` + +## Start Service + +You can start the service directly using the uv script (default port 5000): + +```bash +uv run api.py +``` + +The service will automatically detect the computing device (CUDA > MPS > CPU) upon startup. + +### Docker Startup + +If deploying with Docker, you can refer to the following command. You can specify a custom model path using `-e MODEL_DIR`: + +```bash +docker run -d --restart always -p 5000:5000 --gpus "device=1" \ + -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \ + --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \ + harbor.bwgdi.com/library/fun-asr:0.0.1 +``` + +## API Documentation + +### 1. FunASR Standard Inference Interface + +* **URL**: `/inference/funasr` +* **Method**: `POST` +* **Content-Type**: `multipart/form-data` + +| Parameter Name | Type | Required | Default | Description | +| :--- | :--- | :--- | :--- | :--- | +| `file` | File | Yes | - | Audio file | +| `language` | String | No | "中文" | Target language | +| `itn` | String | No | "true" | Whether to enable Inverse Text Normalization (true/false) | +| `hotwords` | String | No | "" | List of hotwords to improve recognition rate of specific vocabulary | + +**Example**: +```bash +curl -X POST "http://127.0.0.1:5000/inference/funasr" \ + -F "file=@/path/to/audio.wav" \ + -F "hotwords=开放时间" +``` + +### 2. 
Direct Underlying Inference Interface + +* **URL**: `/inference/direct` +* **Method**: `POST` +* **Content-Type**: `multipart/form-data` + +| Parameter Name | Type | Required | Default | Description | +| :--- | :--- | :--- | :--- | :--- | +| `file` | File | Yes | - | Audio file | +| `chunk_mode` | Boolean | No | False | Whether to enable chunk simulation mode (true/false) | + +**Example**: +```bash +# Enable chunk simulation mode +curl -X POST "http://127.0.0.1:5000/inference/direct" \ + -F "file=@/path/to/audio.wav" \ + -F "chunk_mode=true" +``` +**Response**: +```json +{ + "status": "success", + "mode": "direct", + "text": { + "key": "rand_key_WgNZq6ITZM5jt", + "text": "你好。", + "text_tn": "你好", + "label": "null", + "ctc_text": "你好", + "ctc_timestamps": [ + { + "token": "你", + "start_time": 1.8, + "end_time": 1.86, + "score": 0.908 + }, + { + "token": "好", + "start_time": 2.16, + "end_time": 2.22, + "score": 0.988 + } + ], + "timestamps": [ + { + "token": "你", + "start_time": 1.8, + "end_time": 1.86, + "score": 0.908 + }, + { + "token": "好", + "start_time": 2.16, + "end_time": 2.22, + "score": 0.988 + }, + { + "token": "。", + "start_time": 2.88, + "end_time": 2.94, + "score": 0.0 + } + ] + } +} +``` \ No newline at end of file diff --git a/readme_bw_zh.md b/readme_bw_zh.md new file mode 100644 index 0000000..7e11671 --- /dev/null +++ b/readme_bw_zh.md @@ -0,0 +1,147 @@ +# FunASR Dual-Mode API + +这是一个基于 FastAPI 构建的语音识别(ASR)服务,集成了 FunASR 的两种推理模式,旨在提供灵活的语音转写能力。 + +## 功能特性 + +服务提供了两个主要的推理接口: + +1. **AutoModel 模式 (`/inference/funasr`)**: + * 使用 `funasr.AutoModel` 高级接口。 + * 集成 VAD(语音活动检测)。 + * 支持热词(Hotwords)增强。 + * 支持 ITN(逆文本标准化)。 + * 支持多语言配置。 + +2. 
**Direct Model 模式 (`/inference/direct`)**: + * 直接调用底层 `FunASRNano` 模型。 + * 支持普通全量推理。 + * 支持模拟流式/分片推理(Chunk Mode),用于测试模型的增量解码能力。 + +## 环境准备 + +### 依赖安装 + +本项目使用 `uv` 进行依赖管理。请确保已安装 `uv`,然后在项目根目录下运行: + +```bash +uv sync +``` + +### 模型配置 + +默认模型路径配置为 `/models/Fun-ASR-Nano-2512`。如果你的模型在其他位置,请设置环境变量 `MODEL_DIR`: + +```bash +export MODEL_DIR="/你的/模型/绝对路径" +``` + +## 启动服务 + +可以直接运行 uv 脚本启动(默认端口 5000): + +```bash +uv run api.py +``` + +服务启动时会自动检测计算设备(CUDA > MPS > CPU)。 + +### Docker 启动 + +若使用 Docker 部署,可参考以下命令。如需自定义模型路径,可通过 `-e MODEL_DIR` 指定: + +```bash +docker run -d --restart always -p 5000:5000 --gpus "device=1" \ + -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \ + --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \ + harbor.bwgdi.com/library/fun-asr:0.0.1 +``` + +## 接口文档 + +### 1. FunASR 标准推理接口 + +* **URL**: `/inference/funasr` +* **Method**: `POST` +* **Content-Type**: `multipart/form-data` + +| 参数名 | 类型 | 必填 | 默认值 | 说明 | +| :--- | :--- | :--- | :--- | :--- | +| `file` | File | 是 | - | 音频文件 | +| `language` | String | 否 | "中文" | 目标语言 | +| `itn` | String | 否 | "true" | 是否开启逆文本标准化 (true/false) | +| `hotwords` | String | 否 | "" | 热词列表,用于提升特定词汇识别率 | + +**示例**: +```bash +curl -X POST "http://127.0.0.1:5000/inference/funasr" \ + -F "file=@/path/to/audio.wav" \ + -F "hotwords=开放时间" +``` + +### 2. 
Direct 底层推理接口 + +* **URL**: `/inference/direct` +* **Method**: `POST` +* **Content-Type**: `multipart/form-data` + +| 参数名 | 类型 | 必填 | 默认值 | 说明 | +| :--- | :--- | :--- | :--- | :--- | +| `file` | File | 是 | - | 音频文件 | +| `chunk_mode` | Boolean | 否 | False | 是否开启分片模拟模式 (true/false) | + +**示例**: +```bash +# 开启分片模拟模式 +curl -X POST "http://127.0.0.1:5000/inference/direct" \ + -F "file=@/path/to/audio.wav" \ + -F "chunk_mode=true" +``` +**返回**: +```json +{ + "status": "success", + "mode": "direct", + "text": { + "key": "rand_key_WgNZq6ITZM5jt", + "text": "你好。", + "text_tn": "你好", + "label": "null", + "ctc_text": "你好", + "ctc_timestamps": [ + { + "token": "你", + "start_time": 1.8, + "end_time": 1.86, + "score": 0.908 + }, + { + "token": "好", + "start_time": 2.16, + "end_time": 2.22, + "score": 0.988 + } + ], + "timestamps": [ + { + "token": "你", + "start_time": 1.8, + "end_time": 1.86, + "score": 0.908 + }, + { + "token": "好", + "start_time": 2.16, + "end_time": 2.22, + "score": 0.988 + }, + { + "token": "。", + "start_time": 2.88, + "end_time": 2.94, + "score": 0.0 + } + ] + } +} +``` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index eac7d4a..ef06201 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ -torch>=2.9.0 -torchaudio>=2.9.0 +torchaudio transformers>=4.51.3 funasr>=1.3.0 zhconv @@ -7,3 +6,6 @@ whisper_normalizer pyopenjtalk-plus compute-wer openai-whisper +fastapi +uvicorn +python-multipart==0.0.20 \ No newline at end of file