1. Add timestamp information to the returned results. 2. Load the model automatically when the service starts.

kenwaytis · kenwaytis · commit 3060a855f29a · 2023-07-19T16:45:22.000+08:00
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,10 @@
-FROM registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.6.0
-WORKDIR /modelscope/pytorch
+FROM paidax/dev-containers:modelscope-v0.6
+
+ARG HTTP_PROXY
+ENV HTTP_PROXY=${HTTP_PROXY}
+ENV HTTPS_PROXY=${HTTP_PROXY}
+
+WORKDIR /home/funasr
 
 RUN pip install --no-cache-dir \
     loguru \
@@ -11,5 +16,9 @@ RUN pip install --no-cache-dir \
 
 COPY . .
 
-RUN python download_model.py 
+RUN python download_model.py && \
+    git clone https://github.com/alibaba/FunASR.git && \
+    cd FunASR && \
+    pip install -e ./
 
+WORKDIR /home/funasr
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -4,8 +4,10 @@ services:
     build: 
       context: .
       dockerfile: Dockerfile
-    image: paidax/funasr_python:0.3.2
+      args:
+        HTTP_PROXY: ${HTTP_PROXY}
+    image: paidax/funasr_python:0.4.2
     runtime: nvidia
     ports:
       - 9527:9527
-    command: uvicorn main:app --port 9527 --host 0.0.0.0
+    command: tail -f /dev/null
diff --git a/download_model.py b/download_model.py
@@ -1,5 +1,6 @@
 from modelscope.hub.snapshot_download import snapshot_download
 
-snapshot_download("damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
-snapshot_download("damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+snapshot_download("damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") # long-audio model
+snapshot_download("damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch") # punctuation model
+snapshot_download("damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch") # voice endpoint detection model
+snapshot_download("damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
diff --git a/main.py b/main.py
@@ -35,10 +35,11 @@ def initialize_model(model_type, hotword):
         loaded_model["model_type"] = "normal"
         model = pipeline(
             task=Tasks.auto_speech_recognition,
-            vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
             model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+            vad_model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
             # lm_model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch',
-            punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+            punc_model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
+            timestamp_model="damo/speech_timestamp_prediction-v1-16k-offline"
         )
     elif model_type == "long":
         log.debug("lodding model: long")
@@ -72,6 +73,10 @@ def load_model(model_type, hotword):
     if loaded_model["model_type"] is None or loaded_model["model_type"] != model_type or (loaded_model["model_type"] == "hotword" and hotword_parm["hotword"] != hotword):
         loaded_model["model"] = initialize_model(model_type, hotword)
 
+@app.on_event("startup")
+async def startup_event():
+    load_model(model_type="long", hotword=None)
+    rec_result = loaded_model["model"](audio_in="./16000_001.wav")
 
 @app.post("/asr", tags=["ASR"], summary="聚合ASR模型接口服务")
 async def predict(items: Audio):
@@ -93,14 +98,20 @@ async def predict(items: Audio):
         log.info(f"Received a url in string, url: {items.file}")
         decoded_data = requests.get(items.file).content
 
-    load_model(model_type=items.model_type, hotword=items.hotword)
+    load_model(model_type="long", hotword=items.hotword)
     rec_result = loaded_model["model"](audio_in=decoded_data)
-    if items.model_type=='normal' or items.model_type=='long':
-        rec_result = {
-            "text": rec_result["text"]
-        }
+    result = []
     log.info(rec_result)
-    return rec_result
+    for sentence in rec_result["sentences"]:
+        result.append(
+            {
+                "text": sentence["text"],
+                "start": sentence["start"] / 1000.0,
+                "end": sentence["end"] / 1000.0
+            }
+        )
+    log.info(result)
+    return result
 
 @app.get("/health")
 async def health_check():