python实时语音转文字,python 语音转文字开源

因为项目中需要把微信里的语音转成文本再做处理，本文只介绍语音转文本这一部分。

需要注意的是，各平台对语音的格式都有要求，所以我们需要先对语音进行格式转换。

语音转换

使用的工具是ffmpeg, ffmpeg的安装和配置请自行百度。

import os

import tempfile

import subprocess

import base64

import logging

logger = logging.getLogger(__name__)

def mp3_2_wav(_path=None, _byte=None):
    """Convert an audio file (e.g. MP3) to a 16 kHz mono WAV using ffmpeg.

    At least one of ``_path`` / ``_byte`` must be given; when both are
    supplied ``_path`` takes precedence.

    :param _path: path of the source audio file on disk.
    :param _byte: raw bytes of the source audio. They are written to a
        temporary file first because ffmpeg is invoked with a file path.
    :return: content of the converted WAV file as bytes, or ``None`` when
        no input was supplied or the conversion failed.
    """
    if _path is None and _byte is None:
        return None

    source_tmp = None   # temp file holding ``_byte``; only set when we create it
    target_name = None  # temp file ffmpeg writes the WAV into
    try:
        if _path is None:
            # Persist the byte stream so ffmpeg can read it by path. The file
            # is closed before ffmpeg runs so the data is flushed and the file
            # is not locked (matters on Windows).
            with tempfile.NamedTemporaryFile(mode="w+b", delete=False) as tmp:
                source_tmp = tmp.name
                tmp.write(_byte)
            _path = source_tmp

        # Only reserve a unique name here; ffmpeg overwrites the file (-y).
        with tempfile.NamedTemporaryFile(mode="w+b", delete=False, suffix='.wav') as target:
            target_name = target.name

        # -t 60: keep at most 60 seconds, -ar 16K: 16 kHz sample rate,
        # -ac 1: force a single channel instead of relying on the input
        # already being mono.
        logger.info('mp3 ==> wav ========================')
        command = ['ffmpeg', '-y', '-t', '60', '-i', _path,
                   '-ar', '16K', '-ac', '1', target_name]
        return_code = subprocess.call(command)
        logger.info('mp3 ==> wav ==={}====================='.format(return_code))

        if return_code != 0:
            # Do not hand the unconverted input back to the caller.
            return None

        with open(target_name, 'rb') as wav_file:
            return wav_file.read()
    except Exception as e:
        logger.error('mp3_2_wav error [{}]'.format(e))
        return None
    finally:
        # Always remove the temp files we created, on success and on failure.
        for name in (target_name, source_tmp):
            if name is None:
                continue
            try:
                os.remove(name)
            except OSError as e:
                logger.warning('mp3_2_wav cleanup failed [{}]'.format(e))

百度AI开放平台 API

百度的比较方便：官方已经封装好了SDK，安装和使用都很简单，上面的API文档里有详细介绍。

def BAIDU_ASR(_path):
    """Transcribe an audio file with the Baidu AI speech SDK (``aip``).

    The file is first converted with ``mp3_2_wav`` and the resulting WAV
    bytes are sent to ``AipSpeech.asr``.

    :param _path: path of the audio file to transcribe.
    :return: the first entry of the ``result`` list in the response, or
        ``None`` when the conversion failed or the response carries no
        ``result`` (e.g. an error response).
    """
    from aip import AipSpeech

    # Your APPID / AK / SK
    APP_ID = '你的 App ID'
    API_KEY = '你的 Api Key'
    SECRET_KEY = '你的 Secret Key'

    audio = mp3_2_wav(_path)
    if not audio:
        logger.error('BAIDU_ASR: audio conversion failed for [{}]'.format(_path))
        return None

    client = AipSpeech(APP_ID, API_KEY, SECRET_KEY)
    # The payload is a WAV container (ffmpeg writes a .wav file), so declare
    # it as 'wav' rather than headerless 'pcm'.
    # NOTE(review): dev_pid selects the recognition model; confirm '1537'
    # against the Baidu documentation for your language.
    result = client.asr(audio, 'wav', 16000, {
        'dev_pid': '1537',
    })

    # Error responses have no 'result' list; avoid indexing into None.
    candidates = result.get('result') if isinstance(result, dict) else None
    if not candidates:
        logger.error('BAIDU_ASR failed [{}]'.format(result))
        return None
    return candidates[0]

讯飞开放平台API

需要把请求的IP在讯飞控制台加入白名单, 否则报错：

{"code":"10105","data":"","desc":"illegal access|illegal client_ip: xxx.xxx.xxx.xxx","sid":"....."}

把报错信息里的xxx.xxx.xxx.xxx加入白名单即可

import json

import time

import requests

def XUNFEI_ASR(_path):
    """Transcribe an audio file through the iFlytek (Xunfei) IAT web API.

    The audio is converted with ``mp3_2_wav``, base64-encoded and posted as
    a form body. The request is authenticated with an MD5 checksum over
    ``API_KEY + timestamp + base64(param JSON)``.

    :param _path: path of the audio file to transcribe.
    :return: the raw response body decoded as UTF-8 text, or ``None`` when
        the audio conversion failed.
    """
    import hashlib
    import urllib.parse

    _byte = mp3_2_wav(_path)
    if not _byte:
        # b64encode(None) would raise; report the real cause instead.
        logger.error('XUNFEI_ASR: audio conversion failed for [{}]'.format(_path))
        return None

    url = 'http://api.xfyun.cn/v1/service/v1/iat'
    APP_ID = '你的 App ID'
    API_KEY = '你的 Api Key'

    body = urllib.parse.urlencode({'audio': base64.b64encode(_byte)})

    # NOTE(review): engine/encoding values are taken as-is; confirm them
    # against the iFlytek documentation.
    param = {"engine_type": "sms16k", "aue": "raw"}
    # Compact JSON (no spaces), then base64, kept as text so every header
    # value has the same type.
    x_param = base64.b64encode(
        json.dumps(param, separators=(',', ':')).encode('utf-8')
    ).decode('utf-8')
    x_time = str(int(time.time()))  # current time in whole seconds
    x_checksum = hashlib.md5((API_KEY + x_time + x_param).encode('utf-8')).hexdigest()

    x_header = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'X-Appid': APP_ID,
        'X-CurTime': x_time,
        'X-Param': x_param,
        'X-CheckSum': x_checksum,
    }

    # Without a timeout a stalled connection would block the caller forever.
    res = requests.post(url, body, headers=x_header, timeout=30)
    return res.content.decode('utf-8')

腾讯AI开放平台API

腾讯平台主要是签名算法需要注意

签名算法

def get_sign_code(params, app_key):
    """Compute the request signature for the Tencent AI open platform.

    Algorithm (the parameters differ per API, the steps are fixed):

    1. Sort the request parameters by key in ascending order.
    2. Join them as URL key/value pairs (``key1=value1&key2=value2``); the
       value part is URL-encoded with upper-case hex digits (``%E8``, not
       ``%e8``).
    3. Append the application secret as ``app_key=<secret>``.
    4. MD5 the resulting string and upper-case the hex digest.

    Notes:

    * Parameter names are case-sensitive; parameters whose value is the
      empty string do not take part in the signature.
    * According to the platform rules quoted in the original notes, a
      signature is valid for 5 minutes, so compute it right before sending.

    :param params: dict of request parameters.
    :param app_key: application secret appended as the final pair.
    :return: upper-case MD5 hex digest, or ``None`` when ``params`` is not
        a non-empty dict or the signature could not be computed.
    """
    # Imported here because the module has no file-level import of these
    # names; without them every call failed with NameError and returned None.
    import hashlib
    import urllib.parse

    if not isinstance(params, dict) or not params:
        return None

    try:
        pairs = [
            urllib.parse.urlencode({key: value})
            for key, value in sorted(params.items(), key=lambda item: item[0])
            if value != ''  # empty values are excluded from the signature
        ]
        pairs.append('app_key=' + app_key)
        digest = hashlib.md5('&'.join(pairs).encode('utf-8')).hexdigest()
        return digest.upper()
    except Exception as e:
        logger.error('tencen get_sign_code error [{}]'.format(e))
        return None

ASR

def TencenASR(_path):
    """Transcribe an audio file through the Tencent AI ``aai_asr`` API.

    The audio is converted with ``mp3_2_wav``, base64-encoded and posted
    together with a signature produced by ``get_sign_code``.

    :param _path: path of the audio file to transcribe.
    :return: the ``data.text`` field of the response when ``ret`` is 0;
        ``None`` when the conversion, the signing or the API call failed.
    """
    APP_ID = '你的 App ID'
    API_KEY = '你的 Api Key'

    _byte = mp3_2_wav(_path)
    if not _byte:
        # b64encode(None) would raise; report the real cause instead.
        logger.error('TencenASR: audio conversion failed for [{}]'.format(_path))
        return None
    base64_audio = base64.b64encode(_byte)

    url = 'https://api.ai.qq.com/fcgi-xxdzp/aai/aai_asr'
    # NOTE(review): nonce_str is a fixed value here; the name suggests it
    # should be random per request - confirm against the API documentation.
    # NOTE(review): 'format': 2 presumably selects WAV - confirm.
    params = {
        'app_id': APP_ID,
        'time_stamp': int(time.time()),
        'nonce_str': 'fa577ce340859f9fe',
        'format': 2,
        'speech': base64_audio,
        'rate': 16000,
    }

    sign = get_sign_code(params, API_KEY)
    if sign is None:
        # Sending the request unsigned can only be rejected by the server.
        logger.error('TencenASR: could not compute request signature')
        return None
    params['sign'] = sign

    # Without a timeout a stalled connection would block the caller forever.
    response = requests.post(url, data=params, timeout=30)
    jsonData = json.loads(response.text)

    if jsonData.get('ret') != 0:
        # Previously the Response object itself leaked out on failure.
        logger.error('Tencen ASR failed [{}]'.format(jsonData))
        return None

    text = (jsonData.get('data') or {}).get('text')
    logger.info('Tencen ==> wav ===> {}'.format(text))
    return text

至此关键代码开发完成，可以对比各平台的识别效果后再选择使用。