# jarvis-models

## Conda Environment and Python Library Requirements

```bash
conda create -n jarvis-models python==3.10.11
conda activate jarvis-models
pip install -r sample/requirement_out_of_pytorch.txt
pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
```

## More Dependencies

| System | package | web | install command |
| --- | --- | --- | --- |
| python | filetype | https://pypi.org/project/filetype/ | pip install filetype |
| python | FastAPI | https://fastapi.tiangolo.com/ | pip install fastapi |
| python | python-multipart | https://pypi.org/project/python-multipart/ | pip install python-multipart |
| python | uvicorn | https://www.uvicorn.org/ | pip install "uvicorn[standard]" |
| python | SpeechRecognition | https://pypi.org/project/SpeechRecognition/ | pip install SpeechRecognition |
| python | gtts | https://pypi.org/project/gTTS/ | pip install gTTS |
| python | PyYAML | https://pypi.org/project/PyYAML/ | pip install PyYAML |
| python | injector | https://github.com/python-injector/injector | pip install injector |
| python | langchain | https://github.com/langchain-ai/langchain | pip install langchain |
| python | chromadb | https://docs.trychroma.com/getting-started | pip install chromadb |
| python | lagent | https://github.com/InternLM/lagent/blob/main/README.md | pip install lagent |
| python | sentence_transformers | https://www.sbert.net/ | pip install sentence_transformers |

## Start

Start the jarvis-models service with either:

```bash
uvicorn main:app --reload
```

or

```bash
python main.py
```

## Configuration

Create a file named `.env.yaml` at the root of jarvis-models and copy the following YAML configuration into it. Adjust the hosts, ports, and file paths to match your own environment.

```yaml
env:
  version: 0.0.1
  host: 0.0.0.0
  port: 8000

log:
  level: debug
  time_format: "%Y-%m-%d %H:%M:%S"
  filename: "D:/Workspace/Logging/jarvis/jarvis-models.log"

melotts:
  mode: local # or docker
  url: http://10.6.44.141:18080/convert/tts
  speed: 0.9
  device: 'cuda:0'
  language: 'ZH'
  speaker: 'ZH'

cosyvoicetts:
  mode: local # or docker
  url: http://10.6.44.141:18080/convert/tts
  speed: 0.9
  device: 'cuda:0'
  language: '粤语女'
  speaker: 'ZH'

sovitstts:
  mode: docker
  url: http://10.6.80.90:9880/tts
  speed: 0.9
  device: 'cuda:0'
  language: 'ZH'
  speaker: 'ZH'
  text_lang: "yue"
  ref_audio_path: "output/slicer_opt/Ricky-Wong/Ricky-Wong-3-Mins.wav_0006003840_0006134080.wav"
  prompt_lang: "yue"
  prompt_text: "你失敗咗點算啊?你而家安安穩穩,點解要咁樣做呢?"
  text_split_method: "cut5"
  batch_size: 1
  media_type: "wav"
  streaming_mode: True

sensevoiceasr:
  mode: local # or docker
  url: http://10.6.44.141:18080/convert/tts
  speed: 0.9
  device: 'cuda:0'
  language: '粤语女'
  speaker: 'ZH'

tesou:
  url: http://120.196.116.194:48891/chat/

TokenIDConverter:
  token_path: src/asr/resources/models/token_list.pkl
  unk_symbol: <unk>

CharTokenizer:
  symbol_value:
  space_symbol: <space>
  remove_non_linguistic_symbols: false

WavFrontend:
  cmvn_file: src/asr/resources/models/am.mvn
  frontend_conf:
    fs: 16000
    window: hamming
    n_mels: 80
    frame_length: 25
    frame_shift: 10
    lfr_m: 7
    lfr_n: 6
    filter_length_max: -.inf
    dither: 0.0

Model:
  model_path: src/asr/resources/models/model.onnx
  use_cuda: false
  CUDAExecutionProvider:
    device_id: 0
    arena_extend_strategy: kNextPowerOfTwo
    cudnn_conv_algo_search: EXHAUSTIVE
    do_copy_in_default_stream: true
  batch_size: 3

blackbox:
  lazyloading: true

vlms:
  urls:
    qwen_vl: http://10.6.80.87:8000
    qwen2_vl: http://10.6.80.87:23333
    qwen2_vl_72b: http://10.6.80.91:23333

path:
  chroma_rerank_embedding_model: /media/verachen/e0f7a88c-ad43-4736-8829-4d06e5ed8f4f/model/BAAI
  cosyvoice_path: /media/verachen/e0f7a88c-ad43-4736-8829-4d06e5ed8f4f/Workspace/CosyVoice
  cosyvoice_model_path: /media/verachen/e0f7a88c-ad43-4736-8829-4d06e5ed8f4f/model/Voice/CosyVoice/pretrained_models
  sensevoice_model_path: /media/verachen/e0f7a88c-ad43-4736-8829-4d06e5ed8f4f/model/Voice/SenseVoice/SenseVoiceSmall
```