Reconhecimento de voz com asterisk

A ideia é utilizar EAGI para controlar o canal de entrada de áudio em conjunto com o File Descriptor. O Asterisk entrega o áudio em formato RAW diretamente no File Descriptor 3, então podemos utilizar essa informação da maneira que acharmos conveniente. Para este caso a manipulação se torna muito prática, o que me desprende totalmente das aplicações prontas para gravação inseridas no Asterisk (ex.: Record). Nada melhor do que ser livre para voar; é claro que várias análises se tornam possíveis com isso, e o leque de aplicações possíveis se torna infinito.

Estou usando novamente o módulo audiolab para efetuar o encode do áudio em FLAC, caso exista alguma dificuldade para a instalação deste módulo poderei pensar em adaptar o código para uso externo do sox ou flac.

Como ele funciona?

Atende uma ligação
O usuário tem no máximo 10 segundos para efetuar a fala
Caso não encontre atividade de voz, encerra com timeout
A atividade de voz é considerada verdadeira quando RMS > 15 e Pitch > 75
Se atividade for encontrada o usuário poderá falar por no máximo 10 segundos
O script verifica blocos em tempo real com amostras de 1 em 1 segundo e verifica se a fala cessou
Caso sim, o script interrompe a gravação automaticamente e envia o que foi gravado para o Google
Caso não o script continua o seu curso até seu máximo de 10 segundos
Após encontrada a resposta da fala no Google, o script seta a variável “GoogleUtterance”

Instalação:

Dependencies:

apt-get install python-matplotlib
apt-get install python-numpy
apt-get install python-scipy
apt-get install python-dev python-setuptools libsndfile-dev

Download and install audiolab from:
http://pypi.python.org/pypi/scikits.audiolab/

Example of how to use it in the Asterisk dialplan:
Extensions.conf

exten=>_11111111,1,Answer()
exten=>_11111111,n,eagi(pahh.py)
exten=>_11111111,n,GotoIf($[${EXISTS(${GoogleUtterance})}]?hello:bye)
exten=>_11111111,n(hello),NoOP(You Said = ${GoogleUtterance})
exten=>_11111111,n(bye),Hangup()

Fiz um reconhecimento com comparação:
exten=>_1,1,Answer()
exten=>_1,n,eagi(pahh.py)
exten=>_1,n,GotoIf($[${EXISTS(${GoogleUtterance})}]?hello:bye)
exten=>_1,n(hello),NoOP(You Said = ${GoogleUtterance})
exten=>_1,n,GotoIf($["${GoogleUtterance}" = "9 0 8"]?acertei,s,1)
exten=>_1,n,GotoIf($["${GoogleUtterance}" = "9 0 5"]?acertei,s,100)
exten=>_1,n,GotoIf($["${GoogleUtterance}" = "9 1 3"]?acertei,s,200)
exten=>_1,n(bye),Hangup()

; tratei a comparação:
[acertei]
exten => s,1,Dial(DAHDI/8,20)
exten => s,100,Dial(DAHDI/5,20)
exten => s,200,Dial(DAHDI/13,20)

Criar o script com nome pahh.py e colocar na pasta /var/lib/asterisk/agi-bin
Efetuar o comando chmod +x /var/lib/asterisk/agi-bin/pahh.py

Script pahh.py abaixo:
#!/usr/bin/python
#Copyright (c) 2012, Eng Eder de Souza
#Accessing the Google API for speech recognition With Asterisk!
#Eng Eder de Souza
#date 15/01/2012
#http://ederwander.wordpress.com/2012/01/16/google-speech-python-asterisk/
#
# This program is free software, distributed under the terms of
# the GNU General Public License Version 2. See the COPYING file
# at the top of the source tree.
#
#Revision 0.2
#History:
#18/01/2012 bug fix in local variable declaration
#19/01/2012 support for old Python interpreters
#19/01/2012 removed matplotlib dependencies
#19/01/2012 Suppression of DeprecationWarning and UserWarning warnings

import warnings

# Install the filters before the remaining imports so that warnings raised
# while those modules are being imported are ignored as well.
warnings.simplefilter("ignore", DeprecationWarning)
warnings.simplefilter("ignore", UserWarning)
from scikits.audiolab import Format, Sndfile
from scipy.signal import firwin, lfilter
from tempfile import mkstemp
import numpy as np
import urllib2
import math
import sys
import re
import os

# Recognition language passed to the Google endpoint (Brazilian Portuguese).
Lang = "pt-BR"

# For the English recognizer use instead:
# Lang = "en-US"

url = 'https://www.google.com/speech-api/v1/recognize?xjerr=1&client=chromium&lang=' + Lang

# True while the script is still waiting for the caller to start speaking.
silence = True
# AGI environment variables ("agi_*" lines) read from stdin at start-up.
env = {}
# Sample rate in Hz used for pitch estimation and for the FLAC file.
RawRate = 8000
# Number of bytes read per iteration while waiting for speech.
chunk = 1024

# http://en.wikipedia.org/wiki/Vocal_range
# A block only counts as voiced when its estimated pitch is above 75 Hz.
VocalRange = 75.0

# Energy threshold compared against the value returned by rms().
# NOTE: rms() returns a scaled RMS (normalized RMS * 1000), not decibels.
Threshold = 15

# Byte budget for roughly 10 seconds of audio:
# 10 seconds x 8000 samples/second x 2 bytes/sample = 160000 bytes
# 160000 / 1024 = +/- 157 reads
# 157 * 1024 = 160768
TimeoutSignal = 160768

# Byte budget for roughly 1 second of audio:
# 1 second x 8000 samples/second x 2 bytes/sample = 16000 bytes
# 16000 / 1024 = 15.625, rounded up to 16
# 16 * 1024 = 16384
Timeout_NoSpeaking = 16384

# Scales a signed 16-bit sample into the range [-1.0, 1.0) for the RMS.
SHORT_NORMALIZE = (1.0 / 32768.0)

# Block that triggered speech detection; empty until speech is detected.
LastBlock = ''

# File descriptor on which Asterisk (EAGI) delivers the raw audio.
FD = 3

# Open the audio descriptor for binary reading.
# NOTE: the names `file`, `signal` and `all` shadow builtins/modules; they
# are kept because the rest of the script refers to them by these names.
file = os.fdopen(FD, 'rb')

# Number of bytes consumed so far while waiting for speech.
signal = 0

# Accumulates every recorded sample that will be encoded and uploaded.
all = []

# Read the AGI environment from stdin: lines of the form "agi_<name>: <value>"
# are collected into `env` until an empty line (or EOF) is reached.
while 1:
    line = sys.stdin.readline().strip()

    if line == '':
        break
    # Split on the first colon only: a value may itself contain ':' and a
    # line without any colon must not abort the script with a ValueError.
    parts = line.split(':', 1)
    key = parts[0]
    if len(parts) > 1:
        data = parts[1]
    else:
        data = ''
    if key[:4] != 'agi_':
        # Not an AGI variable: report it on stderr and skip the line.
        sys.stderr.write("Did not work!\n")
        sys.stderr.flush()
        continue
    key = key.strip()
    data = data.strip()
    if key != '':
        env[key] = data

# Echo the collected environment to stderr for debugging.
for key in env:
    sys.stderr.write(" -- %s = %s\n" % (key, env[key]))
    sys.stderr.flush()

def SendSpeech(File):
    """Upload a FLAC file to the speech endpoint and publish the result.

    The file at path ``File`` is read and deleted, then POSTed to ``url``.
    The first "utterance" value found in the response is written to stdout
    as an AGI ``SET VARIABLE GoogleUtterance`` command followed by a NOOP
    trace line. When the request fails or no utterance is present, only a
    NOOP trace line is written and the variable is left unset.

    Returns None in every case.
    """
    # try/finally instead of `with` keeps old interpreters working while
    # still guaranteeing the handle is closed.
    flac_file = open(File, "rb")
    try:
        flac = flac_file.read()
    finally:
        flac_file.close()
    os.remove(File)

    header = {'Content-Type': 'audio/x-flac; rate=8000'}
    req = urllib2.Request(url, flac, header)
    try:
        data = urllib2.urlopen(req)
        body = data.read()
    except urllib2.URLError:
        # Network/HTTP failure: report it instead of dying with a traceback.
        sys.stdout.write('EXEC "NOOP" "speech request failed ..." \n')
        sys.stdout.flush()
        return

    find = re.findall('"utterance":(.*),', body)
    if not find:
        # Previously this path fell through to an unbound `result`.
        sys.stdout.write('EXEC "NOOP" "speech not recognized ..." \n')
        sys.stdout.flush()
        return

    result = find[0].replace('"', '')
    if result:
        sys.stdout.write('SET VARIABLE GoogleUtterance "%s"\n' % str(result))
        sys.stdout.flush()
        # The closing quote was missing from this command originally.
        sys.stdout.write('EXEC "NOOP" "%s" \n' % str(result))
        sys.stdout.flush()

def Filter(samps):
    """Return ``samps`` filtered by a 200-tap Hamming-window FIR filter.

    The cutoff handed to ``firwin`` is 0.05 divided by half of ``RawRate``.
    The filter is applied with ``lfilter`` using a denominator of 1 (pure
    FIR), so the output has the same length as the input.
    """
    FC = 0.05 / (0.5 * RawRate)
    N = 200
    a = 1
    b = firwin(N, cutoff=FC, window='hamming')
    return lfilter(b, a, samps)

def Pitch(signal):
    """Estimate the dominant frequency of ``signal`` from its sign changes.

    Each sample is reduced to +1.0 or -1.0 (following its sign bit, which is
    what ``math.copysign(1.0, s)`` did), the sign changes are counted, and
    the count is converted to Hz as ``crossings * RawRate / (2 * len)``.

    Returns the rounded frequency, or 0.0 for an empty signal.
    """
    samples = np.asarray(signal, dtype=np.float64)
    if samples.size == 0:
        # Nothing to analyse; also avoids dividing by zero below.
        return 0.0
    # One vectorised path for every interpreter version: the former
    # pre-2.6 fallback stored the raw samples instead of their signs.
    signs = np.where(np.signbit(samples), -1.0, 1.0)
    crossings = len(np.nonzero(np.diff(signs))[0])
    # 2.0 forces true division so round() sees the fractional value
    # instead of an already truncated integer quotient.
    f0 = round(crossings * RawRate / (2.0 * samples.size))
    return f0


def rms(shorts):
    """Return the scaled RMS energy of a block of 16-bit samples.

    Every sample is multiplied by ``SHORT_NORMALIZE``, squared and summed;
    the sum is divided by half the number of samples, square-rooted and
    multiplied by 1000. Returns 0.0 when the block has fewer than two
    samples.
    """
    # NOTE(review): the divisor is half the sample count, as in the original
    # code; Threshold is presumably tuned against this scaling, so it is
    # kept unchanged.
    count = len(shorts) // 2
    if count == 0:
        # Empty read (e.g. the caller hung up): no energy, no division.
        return 0.0
    normalized = np.asarray(shorts, dtype=np.float64) * SHORT_NORMALIZE
    sum_squares = float(np.sum(normalized * normalized))
    return math.sqrt(sum_squares / count) * 1000


def speaking(data):
    """Return True when the scaled RMS of ``data`` exceeds ``Threshold``."""
    return rms(data) > Threshold

def VAD(SumFrequency, data2):
    """Decide whether a block counts as voice activity.

    ``SumFrequency`` is divided by ``Timeout_NoSpeaking + 1`` to obtain an
    average; the block is voiced only when that average is greater than
    half of ``VocalRange`` and ``speaking(data2)`` is true.

    Returns True or False.
    """
    mean_frequency = SumFrequency / (Timeout_NoSpeaking + 1)
    if not mean_frequency > VocalRange / 2:
        return False
    if speaking(data2):
        return True
    return False

def RecordSpeech(TimeoutSignal, LastBlock, LastLastBlock):
    """Record audio into the global ``all`` list until speech stops.

    ``LastLastBlock`` and ``LastBlock`` are appended first. Audio is then
    read from the global ``file`` in reads of ``Timeout_NoSpeaking`` bytes.
    Each block is appended and checked with ``speaking()``. Recording ends
    when a block is not speech, when more than ``TimeoutSignal`` bytes have
    been requested, or when the stream ends.

    A NOOP trace line is written to stdout for every block. Returns None.
    """
    all.extend(LastLastBlock)
    all.extend(LastBlock)
    signal = 0
    while signal <= TimeoutSignal:
        RawSamps = file.read(Timeout_NoSpeaking)
        # Keep whole 16-bit samples only; a trailing odd byte cannot be
        # decoded.
        usable = len(RawSamps) - (len(RawSamps) % 2)
        if usable == 0:
            # End of stream: stop instead of analysing an empty block.
            break
        samps = np.frombuffer(RawSamps[:usable], dtype=np.int16)
        all.extend(samps)
        signal = signal + Timeout_NoSpeaking
        if speaking(samps):
            sys.stdout.write('EXEC "NOOP" "Speech Found ..." \n')
            sys.stdout.flush()
        else:
            sys.stdout.write('EXEC "NOOP" "End of the Speech..." \n')
            sys.stdout.flush()
            break

def PlayStream(params):
    """Send an AGI ``STREAM FILE`` command and return the reply line.

    The command ``STREAM FILE <params> ""`` is echoed to stderr and then
    written to stdout. One line is then read from stdin, and the stripped
    text of that line is returned.
    """
    command = 'STREAM FILE %s ""\n' % str(params)
    sys.stderr.write(command)
    sys.stderr.flush()
    sys.stdout.write(command)
    sys.stdout.flush()
    # Consume the reply so it does not stay queued on stdin.
    result = sys.stdin.readline().strip()
    return result

sys.stdout.write('EXEC "NOOP" "Hello Waiting For Speech ..." \n')
sys.stdout.flush()

PlayStream("beep")
sys.stdout.flush()

# Phase 1: wait for the caller to start speaking, or give up on timeout.
while silence:
    # Real-time raw audio delivered by Asterisk.
    RawSamps = file.read(chunk)
    # Keep whole 16-bit samples only; an empty read means the stream ended.
    usable = len(RawSamps) - (len(RawSamps) % 2)
    if usable == 0:
        sys.exit()
    samps = np.frombuffer(RawSamps[:usable], dtype=np.int16)
    samps2 = Filter(samps)
    Frequency = Pitch(samps2)
    rms_value = rms(samps)
    signal = signal + chunk
    if (rms_value > Threshold) and (Frequency > VocalRange):
        silence = False
        # LastBlock still holds its initial empty value here, so only the
        # block that triggered detection carries audio into the recording.
        LastLastBlock = LastBlock
        LastBlock = samps
        sys.stdout.write('EXEC "NOOP" "Speech Detected Recording..." \n')
        sys.stdout.flush()
    if signal > TimeoutSignal:
        sys.stdout.write('EXEC "NOOP" "Time Out No Speech Detected ..." \n')
        sys.stdout.flush()
        sys.exit()

# Phase 2: record until the caller stops speaking or the limit is reached.
RecordSpeech(TimeoutSignal, LastBlock, LastLastBlock)

array = np.array(all, dtype=np.int16)

fmt = Format('flac', 'pcm16')
nchannels = 1

cd, FileNameTmp = mkstemp('TmpSpeechFile.flac')
# mkstemp returns an already-open descriptor; close it so it does not leak
# (the file is reopened by name below).
os.close(cd)

# Create the .flac file.
afile = Sndfile(FileNameTmp, 'w', fmt, nchannels, RawRate)

# Write the samples and close the file so the encoded data is on disk
# before SendSpeech reads it back.
try:
    afile.write_frames(array)
finally:
    afile.close()

SendSpeech(FileNameTmp)

# FIM ———- CORTE AQUI —————–

Créditos: Eng Eder Wander

Deixe um comentário