Google text to speech and speech to text speaker diarization with AI

Question

I am designing a Python math project for my students, where they are given a topic (e.g. properties of a circle) and should be able to interact with an AI to receive assistance on anything they don't understand. It should be like a conversation between the AI and the student. The idea is that the human asks a question, this is converted to text by Google Speech-to-Text, the text is passed on to OpenAI GPT to provide a response, and the response is converted to speech via Google Text-to-Speech. The problem I have is that the AI is unable to distinguish between the human voice and the AI voice, so once it responds, it assumes the human has spoken and essentially responds to itself. I am not sure how to fix this. Please help. Apologies for the huge amount of code I have attached, but I am really stuck:

`# Initialize Dialogflow, TTS, and Speech clients
# Dialogflow identifiers. The literal values below are placeholders and must
# be replaced with real project/session identifiers before use.
DIALOGFLOW_PROJECT_ID = 'your-dialogflow-project-id'
DIALOGFLOW_LANGUAGE_CODE = 'en-GB'
SESSION_ID = 'unique-session-id'
# NOTE(review): the Dialogflow client and session path are created at import
# time but are not referenced by any of the views in this file - confirm
# whether they are still needed.
DIALOGFLOW_CLIENT = dialogflow.SessionsClient()
SESSION = DIALOGFLOW_CLIENT.session_path(DIALOGFLOW_PROJECT_ID, SESSION_ID)

# Google Cloud clients, created once at import time and shared by the views:
# TTS_CLIENT is used by generate_speech_response, speech_client by
# handle_audio_diarization.
TTS_CLIENT = texttospeech.TextToSpeechClient()
speech_client = speech.SpeechClient()
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Maximum gap allowed between two diagram clicks; handle_circle_click answers
# with HTTP 408 once more than this many seconds have passed.
INTERACTION_TIMEOUT = 60  # seconds

def circle_view(request):
    """Render the circle lesson page from the 'pdf/circle.html' template."""
    template_name = 'pdf/circle.html'
    return render(request, template_name)

@csrf_exempt
def start_interaction(request):
    """Start a new circle lesson and return the spoken introduction.

    On POST the session is seeded with the lesson state (current part,
    per-part completion flags, last-interaction timestamp and the
    ``awaiting_ai_response`` flag) and the welcome text plus the first
    instruction is returned through ``generate_speech_response``.
    Any other HTTP method yields a 405 JSON error.
    """
    # Guard clause: only POST may start an interaction.
    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    # Seed the per-student lesson state.
    session = request.session
    session['current_part'] = 'circumference'
    session['completed_parts'] = {
        part: False for part in ('circumference', 'radius', 'diameter')
    }
    session['last_interaction'] = time.time()
    session['awaiting_ai_response'] = False

    # The literals are kept exactly as authored (including their leading
    # whitespace) because they are sent verbatim to text-to-speech.
    welcome_prompt = """
        Welcome to the circle section of AI Maths! 
        We'll be learning about three key parts of a circle: the circumference, the radius, and the diameter. 
        You can ask questions anytime you're unsure.
        """
    instruction_prompt = """
        Let's start by identifying the circumference. 
        Please click on the circumference line in the diagram.
        """
    spoken_intro = ''.join((welcome_prompt, instruction_prompt))
    return generate_speech_response(spoken_intro, speaker='AI', request=request)

# Tutor instructions for a correct click, keyed by the part the student was
# asked to find. Each entry is (prompt for the model, next part to ask for);
# a next part of None means the lesson is finished.
_CORRECT_CLICK_PROMPTS = {
    'circumference': (
        "The student correctly identified the circumference. Congratulate them, define the circumference line as the distance around the outside of the circle and then ask them to select the radius.",
        'radius',
    ),
    'radius': (
        "The student correctly identified the radius. Congratulate them, define the radius line as the distance from the center point to a point on the circumference and then ask them to select the diameter.",
        'diameter',
    ),
    'diameter': (
        "The student correctly identified the diameter. Congratulate them, define the diameter line as the distance from one point on the circumference to another, passing through the center, and then conclude the lesson.",
        None,
    ),
}

# Tutor instruction for a wrong click; identical for every part apart from
# the part name itself.
_INCORRECT_CLICK_PROMPT = (
    "The student incorrectly identified the {part}. Kindly tell them they are incorrect, use a creative analogy to correct them and encourage them to try again by clicking on the {part} line."
)


@csrf_exempt
def handle_circle_click(request):
    """Handle a click on the circle diagram and return spoken feedback.

    Expects a POST whose JSON body contains ``elementId``. The clicked
    element is compared with the part stored in the session under
    ``current_part`` (defaulting to ``'circumference'``). A correct click
    advances the lesson to the next part (or removes ``current_part`` after
    the diameter); a wrong click leaves the state untouched. In both cases
    an instruction is sent to ``generate_response`` and its reply is
    returned through ``generate_speech_response``.

    Returns a JSON error with status:
      * 405 for non-POST requests,
      * 408 when more than ``INTERACTION_TIMEOUT`` seconds have passed
        since the stored ``last_interaction`` timestamp,
      * 400 for malformed JSON, a missing ``elementId`` or an unrecognised
        ``current_part`` in the session,
      * 500 for any other failure.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    # A click is a deliberate user action, so no AI echo is expected.
    request.session['awaiting_ai_response'] = False
    logger.debug("Resetting awaiting_ai_response flag to False.")

    # With no stored timestamp the default of 0 makes the request time out,
    # which forces the client to start the interaction first.
    last_interaction = request.session.get('last_interaction', 0)
    if time.time() - last_interaction > INTERACTION_TIMEOUT:
        return JsonResponse({'error': 'Interaction timeout. Please restart the interaction.'}, status=408)

    request.session['last_interaction'] = time.time()

    try:
        data = json.loads(request.body)
        element_id = data.get('elementId')
        current_part = request.session.get('current_part', 'circumference')

        if not element_id:
            return JsonResponse({'error': 'Missing elementId in request.'}, status=400)

        # Without this check an unexpected session value would send an
        # empty prompt to the language model.
        if current_part not in _CORRECT_CLICK_PROMPTS:
            logger.error("Unknown current_part in session: %s", current_part)
            return JsonResponse({'error': 'Unknown lesson state. Please restart the interaction.'}, status=400)

        if element_id == current_part:
            prompt, next_part = _CORRECT_CLICK_PROMPTS[current_part]
            if next_part is None:
                # Lesson finished: forget the current part.
                request.session.pop('current_part', None)
            else:
                request.session['current_part'] = next_part
        else:
            prompt = _INCORRECT_CLICK_PROMPT.format(part=current_part)

        response_text = generate_response(prompt)
        return generate_speech_response(response_text, speaker='AI', request=request)

    except json.JSONDecodeError as e:
        logger.error("Failed to decode JSON: %s", e)
        return JsonResponse({'error': 'Invalid JSON format'}, status=400)
    except Exception as e:
        # Top-level boundary: keep the traceback in the log.
        logger.exception("Unhandled exception: %s", e)
        return JsonResponse({'error': str(e)}, status=500)

@csrf_exempt
def stop_interaction(request):
    """End the current lesson and return a spoken confirmation.

    On POST every lesson-related session key is removed and a fixed
    confirmation message is returned through ``generate_speech_response``.
    Any other HTTP method yields a 405 JSON error.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    # Drop all lesson state; keys that are already absent are ignored.
    lesson_keys = (
        'current_part',
        'completed_parts',
        'last_interaction',
        'awaiting_ai_response',
    )
    for session_key in lesson_keys:
        request.session.pop(session_key, None)

    farewell = "The interaction has been stopped. If you wish to restart, click the start interaction button."
    return generate_speech_response(farewell, speaker='AI', request=request)

@csrf_exempt
def handle_user_query(request):
    """Answer a transcribed question, ignoring the AI's own echoed speech.

    Expects a POST whose JSON body contains ``userQuery`` and, optionally,
    ``speaker`` (defaults to ``'User'``).

    * ``speaker == 'AI'``: the text is treated as the tutor's own voice
      picked up again. If the session flag ``awaiting_ai_response`` is set
      the request is acknowledged with 200 and the flag is cleared;
      otherwise a 400 error is returned.
    * any other speaker: the query is sent to ``generate_response`` and the
      reply is returned through ``generate_speech_response``. After a
      successful reply ``awaiting_ai_response`` is set so that the echo of
      this reply can be recognised and ignored.

    Returns a JSON error with status 405 for non-POST requests, 400 for
    malformed JSON or a missing ``userQuery``, and 500 for any other
    failure.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    try:
        data = json.loads(request.body)
        user_query = data.get('userQuery')
        speaker = data.get('speaker', 'User')

        if not user_query:
            logger.error("Missing userQuery in request.")
            return JsonResponse({'error': 'Missing userQuery'}, status=400)

        logger.debug("Received query from speaker: %s", speaker)
        logger.debug("Current awaiting_ai_response flag: %s", request.session.get('awaiting_ai_response'))

        if speaker == 'AI':
            if request.session.get('awaiting_ai_response'):
                logger.info("Ignoring AI response to prevent feedback loop.")
                request.session['awaiting_ai_response'] = False  # Echo consumed
                return JsonResponse({'message': 'AI response ignored.'}, status=200)
            logger.error("Unexpected AI response received when not awaiting an AI response.")
            return JsonResponse({'error': 'Unexpected AI response.'}, status=400)

        # Generate AI response based on user query
        response_text = generate_response(user_query)

        logger.debug("Generating response for user query.")
        speech_response = generate_speech_response(response_text, speaker='AI', request=request)

        # generate_speech_response clears awaiting_ai_response when given a
        # request, so the flag has to be armed after that call. Setting it
        # earlier left it False by the time the view returned, which made
        # the "ignore echo" branch above unreachable. Only arm it when audio
        # was actually produced.
        if speech_response.status_code == 200:
            request.session['awaiting_ai_response'] = True
        return speech_response

    except json.JSONDecodeError as e:
        logger.error("Failed to decode JSON: %s", e)
        return JsonResponse({'error': 'Invalid JSON format'}, status=400)
    except Exception as e:
        # Top-level boundary: keep the traceback in the log.
        logger.exception("Unhandled exception: %s", e)
        return JsonResponse({'error': str(e)}, status=500)




@csrf_exempt
def handle_audio_diarization(request):
    """Transcribe posted audio and label each word with a speaker.

    Expects a POST whose JSON body contains ``audioContent``: base64-encoded
    audio, which is sent to Speech-to-Text as 16 kHz LINEAR16 en-GB audio
    with speaker diarization enabled.

    Returns ``{'transcript': [{'word': ..., 'speaker': 'AI' | 'User'}, ...]}``.
    Words whose ``speaker_tag`` is 1 are labelled ``'AI'``, all others
    ``'User'``.

    Returns a JSON error with status 405 for non-POST requests, 400 for
    malformed JSON, missing ``audioContent`` or invalid base64, and 500 for
    any other failure.
    """
    if request.method != 'POST':
        return JsonResponse({'error': 'Invalid request method'}, status=405)

    try:
        data = json.loads(request.body)
        audio_content = data.get('audioContent')

        if not audio_content:
            return JsonResponse({'error': 'Missing audioContent'}, status=400)

        try:
            audio_bytes = base64.b64decode(audio_content)
        except ValueError:
            # binascii.Error (bad padding etc.) is a ValueError subclass;
            # this is a client error, not a server failure.
            return JsonResponse({'error': 'Invalid audioContent encoding'}, status=400)

        # Configure diarization settings
        audio = speech.RecognitionAudio(content=audio_bytes)
        diarization_config = speech.SpeakerDiarizationConfig(
            enable_speaker_diarization=True,
            min_speaker_count=2,
            max_speaker_count=10
        )
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code='en-GB',
            diarization_config=diarization_config
        )

        response = speech_client.recognize(config=config, audio=audio)

        # With diarization enabled, the word list of the LAST result already
        # contains every word recognised so far together with its speaker
        # tag. Walking over all results would therefore repeat words.
        results = []
        if response.results:
            final_result = response.results[-1]
            if final_result.alternatives:
                for word_info in final_result.alternatives[0].words:
                    speaker_tag = word_info.speaker_tag
                    # NOTE(review): assumes tag 1 is the AI voice - tags are
                    # assigned per recording, so confirm against real data.
                    speaker = 'AI' if speaker_tag == 1 else 'User'
                    logger.debug("Word: %s, Speaker Tag: %s", word_info.word, speaker_tag)
                    results.append({
                        'word': word_info.word,
                        'speaker': speaker
                    })

        return JsonResponse({'transcript': results})

    except json.JSONDecodeError as e:
        logger.error("Failed to decode JSON: %s", e)
        return JsonResponse({'error': 'Invalid JSON format'}, status=400)
    except Exception as e:
        # Top-level boundary: keep the traceback in the log.
        logger.exception("Unhandled exception: %s", e)
        return JsonResponse({'error': str(e)}, status=500)

def generate_response(prompt):
    """Return the tutor's reply to ``prompt`` from the OpenAI chat API.

    The prompt is sent as the single user message after a fixed system
    message that sets the math-tutor persona. The content of the first
    choice is returned with surrounding whitespace stripped.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful and concise math tutor."},
        {"role": "user", "content": prompt},
    ]
    completion = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=conversation,
        max_tokens=150,
        temperature=0.7,
    )
    # Extract the reply once and reuse it for both logging and the return.
    reply = completion.choices[0].message['content'].strip()
    logger.debug("AI response: %s", reply)
    return reply

def generate_speech_response(text, speaker=None, request=None):
    """Synthesize ``text`` to MP3 speech and wrap it in a JSON response.

    Args:
        text: Text to speak, using the en-GB "en-GB-Standard-F" voice.
        speaker: Optional label; when truthy it is included in the payload
            under ``'speaker'``.
        request: Optional request; when truthy its session flag
            ``awaiting_ai_response`` is set to False.

    Returns:
        A JsonResponse with ``audioContent`` (base64-encoded MP3) and,
        if given, ``speaker``; or a 500 JSON error when the synthesized
        audio is not a ``bytes`` object.
    """
    logger.debug("Generating speech response for text: %s", text)

    tts_result = TTS_CLIENT.synthesize_speech(
        input=texttospeech.SynthesisInput(text=text),
        voice=texttospeech.VoiceSelectionParams(
            language_code="en-GB",
            name="en-GB-Standard-F",
        ),
        audio_config=texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
        ),
    )

    raw_audio = tts_result.audio_content
    if not isinstance(raw_audio, bytes):
        logger.error("Audio content is not of type bytes.")
        return JsonResponse({'error': 'Audio content error'}, status=500)

    # JSON cannot carry raw bytes, so the audio travels as base64 text.
    payload = {'audioContent': base64.b64encode(raw_audio).decode('utf-8')}
    if speaker:
        payload['speaker'] = speaker

    if request:
        logger.debug("Resetting awaiting_ai_response flag.")
        request.session['awaiting_ai_response'] = False

    return JsonResponse(payload)
`

I have shut off the response for the introductory text by manually ignoring it, but I can't do the same for the other AI responses because I can't control them. The only way I have got it to work so far is by having a button which the user clicks to speak, but I want it to be a normal conversation.

Google text to speech and speech to text speaker diarization with AI

Answers (0)

Related Questions