From 787975112603b0d4a152fd396cd4b12e5d04d0ee Mon Sep 17 00:00:00 2001
From: vera <511201264@qq.com>
Date: Tue, 10 Feb 2026 17:56:37 +0800
Subject: [PATCH] feat: api

---
 .github/workflows/ci-cd.yml |  33 ++++++++
 .gitignore                  |  38 ++++++++++
 Dockerfile                  |  21 ++++++
 api.py                      | 133 ++++++++++++++++++++++++++++++++
 pyproject.toml              |  27 +++++++
 readme_bw.md                | 147 ++++++++++++++++++++++++++++++++++++
 readme_bw_zh.md             | 147 ++++++++++++++++++++++++++++++++++++
 requirements.txt            |   6 +-
 8 files changed, 550 insertions(+), 2 deletions(-)
 create mode 100644 .github/workflows/ci-cd.yml
 create mode 100644 .gitignore
 create mode 100644 Dockerfile
 create mode 100644 api.py
 create mode 100644 pyproject.toml
 create mode 100644 readme_bw.md
 create mode 100644 readme_bw_zh.md

diff --git a/.github/workflows/ci-cd.yml b/.github/workflows/ci-cd.yml
new file mode 100644
index 0000000..ceb2df5
--- /dev/null
+++ b/.github/workflows/ci-cd.yml
@@ -0,0 +1,33 @@
+name: Build container
+env:
+  VERSION: 0.0.1  # fixed image tag: every run re-publishes this same tag
+  REGISTRY: https://harbor.bwgdi.com  # registry address (with scheme) passed to the login step
+  REGISTRY_NAME: harbor.bwgdi.com  # bare host name used as the image tag prefix
+  REGISTRY_PATH: library
+  DOCKER_NAME: fun-asr
+on:
+  push:
+    branches:
+      - main
+  workflow_dispatch:  # also allow manually triggered runs
+jobs:
+  build-docker:
+    runs-on: builder-ubuntu-latest  # NOTE(review): presumably a self-hosted runner label - confirm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v3
+      - name: Login to Harbor registry
+        uses: docker/login-action@v3
+        with:
+          registry: ${{ env.REGISTRY }}
+          username: ${{ secrets.BWGDI_NAME }}
+          password: ${{ secrets.BWGDI_TOKEN }}
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v2
+      - name: Build and push
+        uses: docker/build-push-action@v4
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ env.REGISTRY_NAME }}/${{ env.REGISTRY_PATH }}/${{ env.DOCKER_NAME }}:${{ env.VERSION }}
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d423de9
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,38 @@
+# Python-generated files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# Distribution / packaging
+build/
+dist/
+wheels/
+*.egg-info/
+
+# Unit test / coverage reports
+.pytest_cache/
+.coverage
+htmlcov/
+coverage.xml
+
+# Logs
+*.log
+log/*.log
+
+# Virtual environments
+.venv/
+venv/
+env/
+
+# IDE settings
+.vscode/
+.idea/
+
+# OS generated files
+.DS_Store
+
+# Generated files
+*.wav
+*.pdf
+# Lock files. NOTE(review): this also ignores uv.lock, which the Dockerfile's COPY step references
+*.lock
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..5f91836
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,21 @@
+FROM python:3.12-slim
+RUN apt-get update && apt-get -y install \
+    ffmpeg \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/
+
+# Create app directory
+WORKDIR /app
+# Copy dependency definitions; uv.lock is globbed because .gitignore excludes *.lock, so it may be absent
+COPY pyproject.toml uv.lock* ./
+
+# Install dependencies (UV_HTTP_TIMEOUT is given in seconds)
+ENV UV_HTTP_TIMEOUT=1200
+RUN uv sync
+
+# Copy the rest of the application
+COPY . .
+
+EXPOSE 5000
+CMD [ "uv", "run", "api.py" ]
\ No newline at end of file
diff --git a/api.py b/api.py
new file mode 100644
index 0000000..7fd1a19
--- /dev/null
+++ b/api.py
@@ -0,0 +1,133 @@
+import os
+import shutil
+import uuid
+import torch
+import numpy as np
+import soundfile as sf
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+
+# 导入两种模式需要的库
+from funasr import AutoModel
+from model import FunASRNano
+from tools.utils import load_audio
+
+app = FastAPI(title="FunASR Dual-Mode API")
+
+# --- Environment configuration ---
+device = (  # first available backend wins: CUDA, then MPS, then CPU
+    "cuda:0" if torch.cuda.is_available()
+    else "mps" if torch.backends.mps.is_available()
+    else "cpu"
+)
+MODEL_DIR = os.getenv("MODEL_DIR", "/models/Fun-ASR-Nano-2512")  # override via the MODEL_DIR env var
+TEMP_DIR = "./temp_audio"  # uploads are staged here while a request is processed
+os.makedirs(TEMP_DIR, exist_ok=True)
+
+# --- Global model initialisation (runs once, when this module is loaded) ---
+print(f"正在加载 AutoModel (Mode 1)...")
+model_auto = AutoModel(
+    model=MODEL_DIR,
+    trust_remote_code=True,  # NOTE(review): presumably lets funasr run code shipped with the model - confirm
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},  # presumably milliseconds (30 s) - TODO confirm
+    device=device,
+    hub="ms"  # presumably selects the ModelScope hub - TODO confirm
+)
+
+print(f"正在加载 Direct Model (Mode 2)...")
+model_direct, direct_kwargs = FunASRNano.from_pretrained(model=MODEL_DIR, device=device)
+tokenizer = direct_kwargs.get("tokenizer", None)  # may be None; the chunk loop checks before using it
+model_direct.eval()
+
+
+# --- 接口 1: Using FunASR for Inference ---
+@app.post("/inference/funasr")
+async def inference_funasr(
+    file: UploadFile = File(...),
+    language: str = Form("中文"),
+    itn: str = Form("true"),
+    hotwords: str = Form("")
+):
+    # Transcribe an uploaded audio file through the funasr AutoModel instance (model_auto).
+    temp_path = save_temp_file(file)
+    try:
+        # Form values arrive as strings; "true", "1" and "t" (any case) enable ITN.
+        itn_enabled = itn.lower() in ("true", "1", "t")
+        lang_arg = language.strip().strip('"')
+        hotword_arg = hotwords.strip().strip('"') or None  # empty string -> None
+        # Key point: no `cache` argument is passed to generate().
+        results = model_auto.generate(
+            input=temp_path,
+            batch_size=1,
+            hotwords=hotword_arg,
+            language=lang_arg,
+            itn=itn_enabled,
+        )
+        return {"status": "success", "text": results[0]["text"]}
+    except Exception as exc:
+        import traceback
+        traceback.print_exc()
+        raise HTTPException(status_code=500, detail=str(exc))
+    finally:
+        remove_temp_file(temp_path)
+
+
+# --- 接口 2: Direct Inference ---
+@app.post("/inference/direct")
+async def inference_direct(
+    file: UploadFile = File(...),
+    chunk_mode: bool = Form(False)  # whether to enable the chunked loop (Mode B below)
+):
+    """Run inference by calling the FunASRNano model from model.py directly."""
+    temp_path = save_temp_file(file)
+    try:
+        if not chunk_mode:
+            # Mode A: one-shot inference; `text` is the whole result record, not only its "text" field
+            res = model_direct.inference(data_in=[temp_path], **direct_kwargs)
+            text = res[0][0]
+        else:
+            # Mode B: re-decode a growing prefix of the audio, seeding each pass with prev_text
+            duration = sf.info(temp_path).duration
+            chunk_size = 0.72  # prefix growth per step; presumably seconds - TODO confirm
+            cum_durations = np.arange(chunk_size, duration + chunk_size, chunk_size)
+            prev_text = ""
+            for idx, cum_duration in enumerate(cum_durations):
+                audio, _ = load_audio(temp_path, 16000, duration=round(cum_duration, 3))
+                step_res = model_direct.inference(
+                    [torch.tensor(audio).to(device)],
+                    prev_text=prev_text,
+                    **direct_kwargs
+                )
+                prev_text = step_res[0][0]["text"]
+
+                # Non-final chunk: drop the last 5 tokens, then strip U+FFFD, which decoding
+                # presumably emits when the cut lands inside a multi-byte character.
+                if idx != len(cum_durations) - 1 and tokenizer:
+                    prev_text = tokenizer.decode(tokenizer.encode(prev_text)[:-5]).replace("\ufffd", "")
+
+            text = prev_text
+
+        return {"status": "success", "mode": "direct", "text": text}
+    except Exception as e:
+        import traceback
+        traceback.print_exc()  # log the failure server-side, matching /inference/funasr
+        raise HTTPException(status_code=500, detail=str(e))
+    finally:
+        remove_temp_file(temp_path)
+
+
+# --- 工具函数 ---
+def save_temp_file(upload_file):  # persist an upload under TEMP_DIR and return the new file's path
+    ext = os.path.splitext(upload_file.filename or "")[1]  # filename may be None -> no extension
+    path = os.path.join(TEMP_DIR, f"{uuid.uuid4()}{ext}")  # random UUID name, original extension kept
+    with open(path, "wb") as buffer:
+        shutil.copyfileobj(upload_file.file, buffer)
+    return path
+
+def remove_temp_file(path):  # delete the staged upload at `path`; called from the endpoints' finally blocks
+    if os.path.exists(path):  # nothing to do when the file is already gone
+        os.remove(path)
+
+if __name__ == "__main__":  # only when executed as a script (e.g. `uv run api.py`)
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=5000)  # all interfaces; port matches the Dockerfile's EXPOSE 5000
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..2f0b1f9
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,27 @@
+[project]
+name = "Fun-ASR"
+version = "0.1.0"
+description = "语音识别/处理相关项目"
+readme = "README.md"
+requires-python = ">=3.12,<3.13"  # same minor version as the Dockerfile base image (python:3.12-slim)
+dependencies = [  # NOTE(review): api.py also imports numpy and soundfile, which are not declared here - confirm they arrive transitively
+    "torch>=2.9.0",
+    "torchaudio>=2.9.0",
+    "transformers>=4.51.3",
+    "funasr>=1.3.0",
+    "zhconv",
+    "whisper_normalizer",
+    "pyopenjtalk-plus",
+    "compute-wer",
+    "openai-whisper",
+    "python-multipart==0.0.20",  # presumably required by FastAPI for the Form/File parameters in api.py - confirm
+    "fastapi>=0.128.0",
+    "uvicorn>=0.40.0",
+]
+
+[tool.uv]
+package = false # Declares that this project is an application, not a library
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
diff --git a/readme_bw.md b/readme_bw.md
new file mode 100644
index 0000000..408c56f
--- /dev/null
+++ b/readme_bw.md
@@ -0,0 +1,147 @@
+# FunASR Dual-Mode API
+
+This is a speech recognition (ASR) service built on FastAPI, integrating two inference modes of FunASR to provide flexible speech transcription capabilities.
+
+## Features
+
+The service provides two main inference interfaces:
+
+1.  **AutoModel Mode (`/inference/funasr`)**:
+    *   Uses the `funasr.AutoModel` high-level interface.
+    *   Integrates VAD (Voice Activity Detection).
+    *   Supports Hotwords enhancement.
+    *   Supports ITN (Inverse Text Normalization).
+    *   Supports multi-language configuration.
+
+2.  **Direct Model Mode (`/inference/direct`)**:
+    *   Directly calls the underlying `FunASRNano` model.
+    *   Supports standard full inference.
+    *   Supports simulated streaming/chunk inference (Chunk Mode) for testing the model's incremental decoding capabilities.
+
+## Environment Setup
+
+### Dependency Installation
+
+This project uses `uv` for dependency management. Please ensure `uv` is installed, then run the following command in the project root directory:
+
+```bash
+uv sync
+```
+
+### Model Configuration
+
+The default model path is configured as `/models/Fun-ASR-Nano-2512`. If your model is located elsewhere, please set the environment variable `MODEL_DIR`:
+
+```bash
+export MODEL_DIR="/your/absolute/path/to/model"
+```
+
+## Start Service
+
+You can start the service directly using the uv script (default port 5000):
+
+```bash
+uv run api.py
+```
+
+The service will automatically detect the computing device (CUDA > MPS > CPU) upon startup.
+
+### Docker Startup
+
+If deploying with Docker, you can refer to the following command. You can specify a custom model path using `-e MODEL_DIR`:
+
+```bash
+docker run -d --restart always -p 5000:5000 --gpus "device=1" \
+  -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \
+  --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \
+  harbor.bwgdi.com/library/fun-asr:0.0.1
+```
+
+## API Documentation
+
+### 1. FunASR Standard Inference Interface
+
+*   **URL**: `/inference/funasr`
+*   **Method**: `POST`
+*   **Content-Type**: `multipart/form-data`
+
+| Parameter Name | Type | Required | Default | Description |
+| :--- | :--- | :--- | :--- | :--- |
+| `file` | File | Yes | - | Audio file |
+| `language` | String | No | "中文" | Target language |
+| `itn` | String | No | "true" | Whether to enable Inverse Text Normalization (true/false) |
+| `hotwords` | String | No | "" | List of hotwords to improve recognition rate of specific vocabulary |
+
+**Example**:
+```bash
+curl -X POST "http://127.0.0.1:5000/inference/funasr" \
+  -F "file=@/path/to/audio.wav" \
+  -F "hotwords=开放时间"
+```
+
+### 2. Direct Underlying Inference Interface
+
+*   **URL**: `/inference/direct`
+*   **Method**: `POST`
+*   **Content-Type**: `multipart/form-data`
+
+| Parameter Name | Type | Required | Default | Description |
+| :--- | :--- | :--- | :--- | :--- |
+| `file` | File | Yes | - | Audio file |
+| `chunk_mode` | Boolean | No | False | Whether to enable chunk simulation mode (true/false) |
+
+**Example**:
+```bash
+# Enable chunk simulation mode
+curl -X POST "http://127.0.0.1:5000/inference/direct" \
+  -F "file=@/path/to/audio.wav" \
+  -F "chunk_mode=true"
+```
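+**Response** (example from the default non-chunk mode, where `text` is the full result record; with `chunk_mode=true`, `text` is a plain string):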
+```json
+{
+    "status": "success",
+    "mode": "direct",
+    "text": {
+        "key": "rand_key_WgNZq6ITZM5jt",
+        "text": "你好。",
+        "text_tn": "你好",
+        "label": "null",
+        "ctc_text": "你好",
+        "ctc_timestamps": [
+            {
+                "token": "你",
+                "start_time": 1.8,
+                "end_time": 1.86,
+                "score": 0.908
+            },
+            {
+                "token": "好",
+                "start_time": 2.16,
+                "end_time": 2.22,
+                "score": 0.988
+            }
+        ],
+        "timestamps": [
+            {
+                "token": "你",
+                "start_time": 1.8,
+                "end_time": 1.86,
+                "score": 0.908
+            },
+            {
+                "token": "好",
+                "start_time": 2.16,
+                "end_time": 2.22,
+                "score": 0.988
+            },
+            {
+                "token": "。",
+                "start_time": 2.88,
+                "end_time": 2.94,
+                "score": 0.0
+            }
+        ]
+    }
+}
+```
\ No newline at end of file
diff --git a/readme_bw_zh.md b/readme_bw_zh.md
new file mode 100644
index 0000000..7e11671
--- /dev/null
+++ b/readme_bw_zh.md
@@ -0,0 +1,147 @@
+# FunASR Dual-Mode API
+
+这是一个基于 FastAPI 构建的语音识别（ASR）服务，集成了 FunASR 的两种推理模式，旨在提供灵活的语音转写能力。
+
+## 功能特性
+
+服务提供了两个主要的推理接口：
+
+1.  **AutoModel 模式 (`/inference/funasr`)**:
+    *   使用 `funasr.AutoModel` 高级接口。
+    *   集成 VAD（语音活动检测）。
+    *   支持热词（Hotwords）增强。
+    *   支持 ITN（逆文本标准化）。
+    *   支持多语言配置。
+
+2.  **Direct Model 模式 (`/inference/direct`)**:
+    *   直接调用底层 `FunASRNano` 模型。
+    *   支持普通全量推理。
+    *   支持模拟流式/分片推理（Chunk Mode），用于测试模型的增量解码能力。
+
+## 环境准备
+
+### 依赖安装
+
+本项目使用 `uv` 进行依赖管理。请确保已安装 `uv`，然后在项目根目录下运行：
+
+```bash
+uv sync
+```
+
+### 模型配置
+
+默认模型路径配置为 `/models/Fun-ASR-Nano-2512`。如果你的模型在其他位置，请设置环境变量 `MODEL_DIR`：
+
+```bash
+export MODEL_DIR="/你的/模型/绝对路径"
+```
+
+## 启动服务
+
+可以直接运行 uv 脚本启动（默认端口 5000）：
+
+```bash
+uv run api.py
+```
+
+服务启动时会自动检测计算设备（CUDA > MPS > CPU）。
+
+### Docker 启动
+
+若使用 Docker 部署，可参考以下命令。如需自定义模型路径，可通过 `-e MODEL_DIR` 指定：
+
+```bash
+docker run -d --restart always -p 5000:5000 --gpus "device=1" \
+  -e MODEL_DIR="/models/Fun-ASR-Nano-2512" \
+  --mount type=bind,source=/your/path/model/Fun-ASR-Nano-2512,target=/models/Fun-ASR-Nano-2512 \
+  harbor.bwgdi.com/library/fun-asr:0.0.1
+```
+
+## 接口文档
+
+### 1. FunASR 标准推理接口
+
+*   **URL**: `/inference/funasr`
+*   **Method**: `POST`
+*   **Content-Type**: `multipart/form-data`
+
+| 参数名 | 类型 | 必填 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- | :--- |
+| `file` | File | 是 | - | 音频文件 |
+| `language` | String | 否 | "中文" | 目标语言 |
+| `itn` | String | 否 | "true" | 是否开启逆文本标准化 (true/false) |
+| `hotwords` | String | 否 | "" | 热词列表，用于提升特定词汇识别率 |
+
+**示例**:
+```bash
+curl -X POST "http://127.0.0.1:5000/inference/funasr" \
+  -F "file=@/path/to/audio.wav" \
+  -F "hotwords=开放时间"
+```
+
+### 2. Direct 底层推理接口
+
+*   **URL**: `/inference/direct`
+*   **Method**: `POST`
+*   **Content-Type**: `multipart/form-data`
+
+| 参数名 | 类型 | 必填 | 默认值 | 说明 |
+| :--- | :--- | :--- | :--- | :--- |
+| `file` | File | 是 | - | 音频文件 |
+| `chunk_mode` | Boolean | 否 | False | 是否开启分片模拟模式 (true/false) |
+
+**示例**:
+```bash
+# 开启分片模拟模式
+curl -X POST "http://127.0.0.1:5000/inference/direct" \
+  -F "file=@/path/to/audio.wav" \
+  -F "chunk_mode=true"
+```
+**返回**（默认非分片模式的示例，此时 `text` 为完整结果对象；`chunk_mode=true` 时 `text` 为纯字符串）:
+```json
+{
+    "status": "success",
+    "mode": "direct",
+    "text": {
+        "key": "rand_key_WgNZq6ITZM5jt",
+        "text": "你好。",
+        "text_tn": "你好",
+        "label": "null",
+        "ctc_text": "你好",
+        "ctc_timestamps": [
+            {
+                "token": "你",
+                "start_time": 1.8,
+                "end_time": 1.86,
+                "score": 0.908
+            },
+            {
+                "token": "好",
+                "start_time": 2.16,
+                "end_time": 2.22,
+                "score": 0.988
+            }
+        ],
+        "timestamps": [
+            {
+                "token": "你",
+                "start_time": 1.8,
+                "end_time": 1.86,
+                "score": 0.908
+            },
+            {
+                "token": "好",
+                "start_time": 2.16,
+                "end_time": 2.22,
+                "score": 0.988
+            },
+            {
+                "token": "。",
+                "start_time": 2.88,
+                "end_time": 2.94,
+                "score": 0.0
+            }
+        ]
+    }
+}
+```
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index eac7d4a..ef06201 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
-torch>=2.9.0
-torchaudio>=2.9.0
+torchaudio
 transformers>=4.51.3
 funasr>=1.3.0
 zhconv
@@ -7,3 +6,6 @@ whisper_normalizer
 pyopenjtalk-plus
 compute-wer
 openai-whisper
+fastapi
+uvicorn
+python-multipart==0.0.20
\ No newline at end of file