import requests
import base64
import time
import wave
import pyaudio
import pygame
import webrtcvad
from openai import OpenAI

# --------------------- Configuration ---------------------
# Baidu Cloud API credentials (replace with your own API Key / Secret Key).
# SECURITY NOTE(review): credentials are hardcoded in source — move them to
# environment variables or a config file before sharing/deploying this script.
BAIDU_API_KEY = "4icZSO1OlMCU2ZiRMhgGCXFu"
BAIDU_SECRET_KEY = "6wJldJ08m1jIX9hb0ULcJrIJ9D1OJW3c"

# DeepSeek API credentials (replace with your own key). Same security note applies.
DEEPSEEK_API_KEY = "sk-f15b44b6b3344cdd820e59acebce9d2c"

# Recording parameters (16 kHz mono 16-bit PCM — required by both Baidu ASR
# and WebRTC VAD).
CHUNK = 1024
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000

# Device ID sent as "cuid" to Baidu APIs (arbitrary identifier).
DEVICE_ID = "raspberry_pi"

# HTTP timeout (seconds) for all REST calls, so a network stall cannot hang
# the assistant's main loop indefinitely.
HTTP_TIMEOUT = 10

# VAD framing: 320 samples @ 16 kHz = 20 ms per frame (a legal WebRTC VAD
# frame size: 10/20/30 ms).
VAD_FRAME_SAMPLES = 320


# --------------------- Utility functions ---------------------

def get_baidu_token():
    """Fetch a Baidu Cloud OAuth access token.

    Returns:
        str | None: the access token, or None on any HTTP/auth failure.
    """
    url = "https://aip.baidubce.com/oauth/2.0/token"
    params = {
        "grant_type": "client_credentials",
        "client_id": BAIDU_API_KEY,
        "client_secret": BAIDU_SECRET_KEY,
    }
    response = requests.post(url, data=params, timeout=HTTP_TIMEOUT)
    if response.status_code == 200:
        return response.json().get("access_token")
    print("获取百度 Token 失败")
    return None


def record_audio_vad(filename, max_duration=10):
    """Record from the default microphone until silence, using WebRTC VAD.

    Stops when `max_silence` consecutive non-speech frames are seen
    (150 frames x 20 ms = 3 seconds of silence) or when `max_duration`
    seconds have been captured. The result is written as 16 kHz mono
    16-bit WAV to `filename`.

    Args:
        filename: output WAV path.
        max_duration: hard cap on recording length, in seconds.
    """
    vad = webrtcvad.Vad(1)  # aggressiveness 0-3; higher = stricter speech gate
    p = pyaudio.PyAudio()
    # Stream format must match WebRTC VAD requirements: 16-bit mono PCM,
    # read in exact 20 ms frames.
    stream = p.open(format=pyaudio.paInt16,
                    channels=1,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=VAD_FRAME_SAMPLES)
    print("开始录音(自动检测静音停止)...")
    frames = []
    silence_count = 0
    max_silence = 150  # 150 frames * 20 ms = 3 seconds of continuous silence

    while True:
        # exception_on_overflow=False: drop overflowed input instead of raising.
        data = stream.read(VAD_FRAME_SAMPLES, exception_on_overflow=False)
        frames.append(data)

        is_speech = vad.is_speech(data, RATE)
        silence_count = 0 if is_speech else silence_count + 1

        if silence_count > max_silence:
            print("检测到静音,录音结束。")
            break

        # Hard cap on total recording length.
        if len(frames) > int((RATE / VAD_FRAME_SAMPLES) * max_duration):
            print("达到最大录音时长,录音结束。")
            break

    stream.stop_stream()
    stream.close()
    # Query sample width before terminating the PortAudio instance.
    sample_width = p.get_sample_size(pyaudio.paInt16)
    p.terminate()

    # Persist the captured frames as a WAV file (context manager guarantees
    # the file handle is closed even if a write fails).
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def speech_recognition(audio_file, token):
    """Transcribe a WAV file via the Baidu speech-recognition REST API.

    Args:
        audio_file: path to a 16 kHz mono WAV file.
        token: Baidu OAuth access token.

    Returns:
        str | None: the first recognition candidate, or None on API error.
    """
    with open(audio_file, "rb") as f:
        speech_data = f.read()
    speech_base64 = base64.b64encode(speech_data).decode('utf-8')
    payload = {
        "format": "wav",
        "rate": RATE,
        "channel": 1,
        "token": token,
        "cuid": DEVICE_ID,
        "len": len(speech_data),
        "speech": speech_base64,
        # Hot words bias recognition toward the wake-word variants.
        "word_list": ["小智", "小志", "小至"]
    }
    url = "http://vop.baidu.com/server_api"
    headers = {'Content-Type': 'application/json'}
    response = requests.post(url, json=payload, headers=headers,
                             timeout=HTTP_TIMEOUT)
    result = response.json()
    if result.get("err_no") == 0:
        return result.get("result", [""])[0]
    print("语音识别错误:", result.get("err_msg"))
    return None


def wake_word_detected(text):
    """Return True if `text` contains any accepted wake-word variant.

    Homophone variants are included because ASR may transcribe the wake
    word with different characters.
    """
    wake_words = ["小智", "小志", "小知", "晓智"]
    return any(word in text for word in wake_words)


def deepseek_conversation(user_text):
    """Ask the DeepSeek chat model and return its answer.

    Args:
        user_text: the user's transcribed question.

    Returns:
        str: the model's reply, or a fixed apology string on any API error.
    """
    try:
        client = OpenAI(api_key=DEEPSEEK_API_KEY,
                        base_url="https://api.deepseek.com")
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system",
                 "content": "你是一名叫小智的助手,回复不需要使用markdown格式,请直接以文本形式回复。"},
                {"role": "user", "content": user_text},
            ],
            stream=False
        )
        return response.choices[0].message.content
    except Exception as e:
        # Best-effort: never crash the main loop on an API failure.
        print("DeepSeek API 调用异常:", e)
        return "抱歉,我无法获取答案。"


def text_to_speech(text, token, output_file="answer.mp3"):
    """Synthesize Chinese speech via the Baidu TTS REST API.

    Args:
        text: text to synthesize (truncated to the API's 1024-char limit).
        token: Baidu OAuth access token.
        output_file: path for the resulting MP3.

    Returns:
        str | None: `output_file` on success, None if the API returned an
        error payload instead of audio.
    """
    MAX_CHAR = 1024  # Baidu TTS request-size limit
    text = text[:MAX_CHAR] if len(text) > MAX_CHAR else text
    params = {
        "tex": text,
        "tok": token,
        "cuid": DEVICE_ID,
        "ctp": 1,       # client type: web
        "lan": "zh"     # language: Chinese
    }
    url = "http://tsn.baidu.com/text2audio"
    response = requests.post(url, data=params, timeout=HTTP_TIMEOUT)
    # Success is signalled by an audio Content-Type; errors come back as JSON.
    # startswith() tolerates an appended charset (e.g. "audio/mp3;charset=...").
    content_type = response.headers.get('Content-Type', '')
    if content_type.startswith("audio"):
        with open(output_file, "wb") as f:
            f.write(response.content)
        return output_file
    print("语音合成错误:", response.text)
    return None


def play_audio(file_path):
    """Play an audio file through pygame, blocking until playback ends."""
    pygame.mixer.init()
    pygame.mixer.music.load(file_path)
    pygame.mixer.music.play()
    while pygame.mixer.music.get_busy():
        time.sleep(0.1)


# --------------------- Main program ---------------------

def main():
    """Main loop: wait for the wake word, then answer one spoken question."""
    print("启动智能助手小智...")
    token = get_baidu_token()
    if not token:
        return
    while True:
        print("等待唤醒词 '小智' ...")
        record_audio_vad("wake.wav", max_duration=3)
        wake_text = speech_recognition("wake.wav", token)
        # Use the homophone-aware helper so ASR variants ("小志" etc.)
        # also trigger the wake-up, matching the hot-word list sent to ASR.
        if wake_text and wake_word_detected(wake_text):
            print("唤醒成功,小智回应:好的,小智在。")
            # Speak the wake-up acknowledgement.
            response_audio = text_to_speech("好的,小智在。请说出你的问题。",
                                            token,
                                            output_file="wakeup_response.mp3")
            if response_audio:
                play_audio(response_audio)
            print("请说出您的问题:")
            record_audio_vad(filename="query.wav")
            user_query = speech_recognition("query.wav", token)
            if user_query:
                print("用户说:", user_query)
                # Get the answer from the DeepSeek model.
                answer = deepseek_conversation(user_query)
                print("小智回答:", answer)
                # Synthesize and play the answer.
                audio_file = text_to_speech(answer, token,
                                            output_file="answer.mp3")
                if audio_file:
                    play_audio(audio_file)
            else:
                print("未能识别您的问题,请重试。")
        time.sleep(1)


if __name__ == '__main__':
    main()