-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
145 lines (126 loc) · 5.12 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
import librosa
import numpy as np
from fuzzywuzzy import process
import nltk
from nltk.corpus import stopwords
from pydub import AudioSegment
import tkinter as tk
from tkinter import messagebox, simpledialog
import speech_recognition as sr # <-- Importing speech recognition
import os
import warnings
# Disable the Hugging Face Hub symlink warning (noise on Windows setups).
os.environ['HF_HUB_DISABLE_SYMLINKS_WARNING'] = '1'
# Suppress specific warnings that are non-critical:
# pydub warns when ffmpeg/avconv is not found; transformers warns about a
# future tokenizer-behavior change. Neither affects this script's logic.
warnings.filterwarnings("ignore", category=UserWarning, module='pydub.utils')
warnings.filterwarnings("ignore", category=FutureWarning, module='transformers.tokenization_utils_base')
# Download NLTK stop-word data (cached after the first run; needs network once).
nltk.download('stopwords')
# Load the pre-trained Wav2Vec 2.0 model and processor at import time.
# NOTE(review): this downloads ~360 MB on first run and blocks startup.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
# Predefined phrases the fuzzy matcher snaps transcripts onto.
# feedback_loop() appends user corrections here at runtime, so the list
# grows during a session (not persisted between runs).
predefined_phrases = [
"Hello, how are you?",
"I need help with something",
"Can you please assist me?",
"What time is it?",
"Thank you very much",
"I'm feeling good today",
"Where is the bathroom?",
"I would like to eat something",
"What is your name?",
"How do I get there?",
"I need a doctor",
"Please call an ambulance",
"Can you please repeat that?",
"I do not understand",
"Can you speak slower?",
"I am lost",
"Where can I find help?",
"Can you guide me?",
"Is this the right way?",
"What is the weather like?",
"Can you open the window?",
"I am feeling cold",
"I am feeling hot",
"Can you turn on the lights?",
"Can you turn off the lights?",
"I need to go home",
"Where is the nearest hospital?",
"Can you tell me the time?",
"What day is it today?",
"Where is the nearest store?",
"How much does this cost?",
"I need some water",
"Can you help me find my keys?",
"Where is the restroom?",
"Can you help me cross the street?",
"Can you please be quiet?",
"Please help me find my way back",
"Can I have a glass of water?",
"Where can I find a taxi?",
"What is your phone number?"
]
def pre_process_audio(audio_data):
    """Load and pre-process an audio file for speech recognition.

    Args:
        audio_data: Path (or file-like object) accepted by ``librosa.load``.

    Returns:
        Tuple ``(y, sample_rate)``: the processed mono waveform as a float
        numpy array and its sample rate (16000).
    """
    # Resample to 16 kHz, the rate Wav2Vec2 base-960h was trained on.
    # Renamed the local from `sr` to `sample_rate`: `sr` shadowed the
    # module-level `speech_recognition as sr` import.
    y, sample_rate = librosa.load(audio_data, sr=16000)
    # Pre-emphasis boosts high frequencies to flatten the speech spectrum.
    y = librosa.effects.preemphasis(y)
    # trim() returns (trimmed_signal, interval); we only need the signal.
    y, _ = librosa.effects.trim(y)
    # Speed up by 1.1x. NOTE(review): time-stretching the input to an ASR
    # model distorts it — confirm this actually improves accuracy.
    y = librosa.effects.time_stretch(y, rate=1.1)
    return y, sample_rate
def recognize_speech(audio_data):
    """Transcribe an audio file to lower-case text with Wav2Vec 2.0.

    Args:
        audio_data: Path to an audio file readable by ``librosa.load``.

    Returns:
        The decoded transcript, lower-cased.
    """
    waveform, sample_rate = pre_process_audio(audio_data)
    # Tokenize/pad the raw waveform into model-ready tensors.
    features = processor(
        waveform,
        return_tensors="pt",
        padding="longest",
        sampling_rate=sample_rate,
    )
    # Greedy CTC decoding: pick the highest-scoring token at each frame.
    logits = model(features.input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    decoded = processor.batch_decode(
        predicted_ids, clean_up_tokenization_spaces=True
    )
    return decoded[0].lower()
def match_phrase(transcript):
    """Fuzzy-match the recognized text against the predefined phrases.

    Stop words are stripped from *transcript* so matching focuses on the
    content words.

    Args:
        transcript: Lower-cased transcript string from the recognizer.

    Returns:
        The best-matching predefined phrase when its fuzzy score exceeds
        70; otherwise the original transcript unchanged.
    """
    stop_words = set(stopwords.words('english'))
    content_words = [w for w in transcript.split() if w not in stop_words]
    cleaned_transcript = ' '.join(content_words)
    # Guard: extractOne returns None for an empty candidate list, and a
    # stop-word-only transcript yields an empty query — the original code
    # raised TypeError on `best_match[1]` in those cases.
    if not cleaned_transcript or not predefined_phrases:
        return transcript
    best_match = process.extractOne(cleaned_transcript, predefined_phrases)
    if best_match is not None and best_match[1] > 70:
        return best_match[0]
    return transcript
def feedback_loop(recognized_text):
    """Confirm the recognized phrase with the user, learning corrections.

    Pops a yes/no dialog; on "no", prompts for the correct phrase and adds
    it to ``predefined_phrases`` so future matches can find it.

    Args:
        recognized_text: The phrase produced by the matcher.

    Returns:
        The confirmed phrase, or the user-supplied correction.
    """
    answer = messagebox.askquestion(
        "Confirmation", f"Did you mean: '{recognized_text}'?"
    )
    if answer != 'no':
        return recognized_text
    correction = simpledialog.askstring(
        "Correction", "Please enter the correct phrase:"
    )
    if not correction:
        # User cancelled or entered nothing — keep the original guess.
        return recognized_text
    predefined_phrases.append(correction)
    return correction
def start_recognition():
    """Capture one utterance from the microphone and display the result.

    Records audio, transcribes it with Wav2Vec 2.0, fuzzy-matches the
    transcript against the predefined phrases, confirms with the user,
    and shows the final phrase in a dialog. Shows an error dialog when no
    microphone is available.
    """
    recognizer = sr.Recognizer()
    # Guard clause: bail out with a clear message if no input device exists.
    if not sr.Microphone.list_microphone_names():
        messagebox.showerror("Error", "No microphone found. Please connect a microphone and try again.")
        return
    mic = sr.Microphone()
    with mic as source:
        recognizer.adjust_for_ambient_noise(source)
        audio = recognizer.listen(source)
    audio_data = audio.get_wav_data()
    temp_path = "temp.wav"
    try:
        # The recognition pipeline loads from disk, so persist the capture.
        with open(temp_path, "wb") as f:
            f.write(audio_data)
        transcript = recognize_speech(temp_path)
    finally:
        # Clean up the scratch recording (the original leaked it on disk,
        # and left it behind on any exception).
        if os.path.exists(temp_path):
            os.remove(temp_path)
    matched_phrase = match_phrase(transcript)
    final_output = feedback_loop(matched_phrase)
    messagebox.showinfo("Final Output", f"You said: {final_output}")
def create_gui():
    """Build and run the single-button Tk window for the assistant."""
    window = tk.Tk()
    window.title("Speech Recognition Assistant")
    window.geometry("300x200")
    # One big button: each press runs a full capture/recognize/confirm cycle.
    tk.Button(
        window,
        text="Start Recognition",
        command=start_recognition,
        font=("Arial", 14),
    ).pack(pady=50)
    window.mainloop()
# Launch the GUI only when executed as a script, not when imported.
if __name__ == "__main__":
    create_gui()