import argparse
import time
import logging
import grpc  
import sys 
import os
import wave
from time import sleep
import urllib
import urllib.request
import base64
import json
from google.protobuf import text_format, json_format

from google.protobuf.json_format import MessageToJson, MessageToDict
 

from nuance.dlg.v1.common.dlg_common_messages_pb2 import *
from nuance.dlg.v1.dlg_messages_pb2 import *
from nuance.dlg.v1.dlg_interface_pb2 import *
from nuance.dlg.v1.dlg_interface_pb2_grpc import *

from nuance.tts.v1 import nuance_tts_v1_pb2
from nuance.asr.v1 import resource_pb2, result_pb2, recognizer_pb2, recognizer_pb2_grpc


# Refresh margin in seconds: a cached OAuth token is reused only while it has more than this long left before expiry.
oauth_token_expiry_threshhold_seconds = 30
# time.monotonic() value at which the cached token expires; stays 0 until a token has been fetched.
oauth_token_expiry_seconds = 0
# Most recently fetched OAuth access token, or None when none has been fetched yet.
oauth_token = None
# Parsed command line arguments; populated by parse_args().
args = None

def parse_args():
    """Parse the command line and store the result in the global ``args``.

    Defines every accepted option together with its help text. The built-in
    ``-h`` handling of argparse is disabled and re-registered by hand so that
    the help entry is listed in the same "options" group as everything else.
    """
    global args

    def wide_help_formatter(prog):
        # Wider help layout than the argparse default.
        return argparse.HelpFormatter(prog, max_help_position=45, width=100)

    # (flags, keyword arguments) for each option, in the order shown by --help.
    option_table = [
        (("-h", "--help"),
         dict(action="help", help="Show this help message and exit")),
        (("--appId",),
         dict(metavar="appId", nargs="?",
              help="Mix appId. For self-hosted use only. Used by Dialog service to resolve resource URNs in self-hosted setup.")),
        # Hidden option: intentionally left out of the help output.
        (("--token",),
         dict(nargs="?", help=argparse.SUPPRESS)),
        (("--oauthURL",),
         dict(metavar="oauthUrl", nargs="?", help="OAuth 2.0 URL")),
        (("--clientID",),
         dict(metavar="clientID", nargs="?", help="OAuth 2.0 Client ID")),
        (("--clientSecret",),
         dict(metavar="clientSecret", nargs="?", help="OAuth 2.0 Client Secret")),
        (("--oauthScope",),
         dict(metavar="oauthScope", nargs="?", default='dlg',
              help="OAuth 2.0 Scope, default=dlg")),
        (("--secure",),
         dict(action="store_true",
              help="Connect to the server using a secure gRPC channel")),
        (("-s", "--serverUrl"),
         dict(metavar="serverUrl", nargs="?", default='localhost:8080',
              help="Dialog server URL, default=localhost:8080")),
        (('--modelUrn',),
         dict(metavar="modelUrn", nargs="?",
              help="Dialog model URN, e.g. urn:nuance:mix/eng-USA/A2_C16/mix.dialog")),
        (("--textInput",),
         dict(metavar="textInput", nargs="?",
              help="Text to perform interpretation on")),
        (("--audioFile",),
         dict(metavar="audioFile", nargs="?",
              help="audio file name for speech input to trigger speech recognition and then interpretation")),
        (("--tts",),
         dict(action="store_true", help="Boolean whether to request TTS")),
        (("--audioDir",),
         dict(metavar="audio directory", nargs="?", default='audio',
              help="Audio output directory for TTS, default=audio. To be used together with --tts.")),
    ]

    parser = argparse.ArgumentParser(
        prog="dlg_client.py",
        usage="%(prog)s [-options]",
        add_help=False,
        formatter_class=wide_help_formatter,
    )
    group = parser.add_argument_group("options")
    for flags, settings in option_table:
        group.add_argument(*flags, **settings)

    args = parser.parse_args()

def get_oauth2_token():
    """Return an OAuth 2.0 access token for the configured endpoint.

    Uses the client ID, client secret and scope from the global ``args`` to
    run a client-credentials request against ``args.oauthURL`` with
    urllib.request. The token and its expiry are cached in module globals and
    the cached token is reused while it has more than
    ``oauth_token_expiry_threshhold_seconds`` left.

    Returns:
        A ``(token, updated)`` tuple. ``token`` is the access token, or None
        when no OAuth URL was supplied. ``updated`` is True only when a new
        token was fetched by this call.

    Raises:
        Exception: the token endpoint answered with an HTTP error.
    """
    global oauth_token
    global oauth_token_expiry_seconds

    # Always hand back a 2-tuple: callers unpack the result, so a bare None
    # here would make them fail with a TypeError.
    if args.oauthURL is None:
        return None, False

    current_time = time.monotonic()
    if oauth_token and oauth_token_expiry_seconds - oauth_token_expiry_threshhold_seconds > current_time:
        log.debug('OAuth token is still valid')
        return oauth_token, False

    log.info("Obtaining auth token (Client ID: {}, URL: {})".format(args.clientID, args.oauthURL))

    # HTTP Basic authentication built from "clientID:clientSecret".
    raw_credentials = "{}:{}".format(args.clientID, args.clientSecret).encode()
    encoded_credentials = base64.standard_b64encode(raw_credentials).decode('utf-8')
    headers = {'Authorization': "Basic {}".format(encoded_credentials)}

    form_fields = {
        'grant_type': 'client_credentials',
        'scope': args.oauthScope,
    }
    request = urllib.request.Request(url=args.oauthURL, headers=headers,
                                     data=urllib.parse.urlencode(form_fields).encode(),
                                     method='POST')

    try:
        with urllib.request.urlopen(request) as response:
            json_response = json.loads(response.read().decode('utf-8'))
    except urllib.error.HTTPError as err:
        raise Exception("Failed to obtain authentication token. Status: {}, Error: {}".format(
            err.code, err.read().decode())) from err

    token_ttl = json_response["expires_in"]
    oauth_token = json_response["access_token"]
    # Expiry is tracked on the monotonic clock so wall-clock changes cannot
    # make a stale token look valid.
    oauth_token_expiry_seconds = time.monotonic() + token_ttl

    log.debug("Token TTL: %d" % token_ttl)
    return oauth_token, True


def create_channel(args):
    """Create the gRPC channel to the Dialog service.

    Args:
        args: Parsed command line arguments. Reads ``token``, ``secure``,
            ``serverUrl`` and ``appId``.

    Returns:
        A secure channel when ``args.secure`` is set, otherwise an insecure
        channel.
    """
    call_credentials = None
    if args.token:
        # Token passed in as a command line argument
        log.debug('Adding CallCredentials using token parameter')
        call_credentials = grpc.access_token_call_credentials(args.token)
    else:
        # Request a token from the OAuth endpoint. The result is not unpacked
        # directly because it can be None when no OAuth URL is configured.
        token_result = get_oauth2_token()
        current_oauth_token = token_result[0] if token_result else None
        if current_oauth_token:
            log.debug('Adding CallCredentials from OAuth endpoint')
            call_credentials = grpc.access_token_call_credentials(current_oauth_token)

    # Secure channel. This is always used when contacting hosted Mix Dialog service.
    # You need to pass in a Boolean command line argument --secure in this case
    if args.secure:
        log.debug("Creating secure gRPC channel")
        channel_credentials = grpc.ssl_channel_credentials()
        if call_credentials is not None:
            channel_credentials = grpc.composite_channel_credentials(channel_credentials, call_credentials)
        return grpc.secure_channel(args.serverUrl, credentials=channel_credentials)

    # Insecure channel. Not applicable to apps contacting Nuance-hosted Mix service.
    # Can be used for self-hosted Mix Dialog service, in which case the Mix appId of
    # the application is supplied with --appId and handed over as "x-nuance-client-id".
    log.debug("Creating insecure gRPC channel")
    if call_credentials is not None:
        # Call credentials are only attached on the secure path above.
        log.warning("An access token is available but is not sent on an insecure channel; use --secure to send it")
    # A None value is not an acceptable channel option, so the entry is only
    # added when an appId was actually supplied.
    # NOTE(review): the appId is passed as a gRPC channel option; confirm that it
    # really reaches the service as the "x-nuance-client-id" header.
    channel_options = []
    if args.appId is not None:
        channel_options.append(("x-nuance-client-id", args.appId))
    return grpc.insecure_channel(args.serverUrl, options=channel_options)

def read_session_id_from_response(response_obj):
    """Extract the session ID from a response dictionary.

    Args:
        response_obj: Response in dictionary form; the ID is read from
            ``response_obj['payload']['sessionId']``.

    Returns:
        The session ID.

    Raises:
        Exception: the payload could not be read, or the session ID is
            missing or empty.
    """
    try:
        payload = response_obj.get('payload')
        found_id = payload.get('sessionId', None)
    except Exception:
        raise Exception("Invalid JSON Object or response object")

    if not found_id:
        raise Exception("Session ID is not present or some error occurred")
    return found_id

def generate_wav_header(sample_rate, bits_per_sample, channels, datasize, formattype):
    """Build the 44-byte .wav file header for the given audio parameters.

    Auxiliary function for saving TTS output as a wav file. All numeric
    fields are written little-endian.

    Args:
        sample_rate: Sample rate in Hz.
        bits_per_sample: Number of bits in each sample.
        channels: Number of channels.
        datasize: Size of the audio data in bytes.
        formattype: Format type, 1 for PCM.

    Returns:
        The header as bytes.
    """
    def le(value, width):
        # Fixed-width little-endian encoding of an integer field.
        return value.to_bytes(width, 'little')

    return b"".join((
        b"RIFF",                                                 # marks the file as RIFF
        le(datasize + 36, 4),                                    # file size excluding this field and the RIFF marker
        b"WAVE",                                                 # file type
        b"fmt ",                                                 # format chunk marker
        le(16, 4),                                               # length of the format data that follows
        le(formattype, 2),                                       # format type (1 - PCM)
        le(channels, 2),                                         # channel count
        le(sample_rate, 4),                                      # sample rate
        le(sample_rate * channels * bits_per_sample // 8, 4),    # bytes per second
        le(channels * bits_per_sample // 8, 2),                  # bytes per sample frame
        le(bits_per_sample, 2),                                  # bits per sample
        b"data",                                                 # data chunk marker
        le(datasize, 4),                                         # data size in bytes
    ))

def save_audio_file_wav(audio, target_audio_file_name, sample_rate, bits_per_sample, channels, formattype):
    """Save raw audio bytes as a .wav file inside ``args.audioDir``.

    Args:
        audio: Audio data as bytes or bytearray.
        target_audio_file_name: Name of the output file, with extension. It is
            joined onto the global ``args.audioDir``.
        sample_rate: Sample rate in Hz.
        bits_per_sample: Number of bits in each sample.
        channels: Number of channels.
        formattype: Format type, 1 for PCM.
    """
    output_file_path = os.path.join(args.audioDir, target_audio_file_name)

    # The directory part is empty when the file goes into the current working
    # directory, and os.makedirs('') would raise in that case.
    output_dir = os.path.dirname(output_file_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Build the header before opening the file so that a failure here does not
    # leave an empty file behind.
    wav_header = generate_wav_header(sample_rate, bits_per_sample, channels, len(audio), formattype)
    with open(output_file_path, "wb") as audio_file:
        audio_file.write(wav_header)
        audio_file.write(audio)
    log.debug("Wrote generated speech audio response to %s" % output_file_path)

def start_request(stub, model_ref_dict, session_id, selector_dict={}):
    """Send a Start request and return the response as a dictionary.

    Args:
        stub: Dialog service stub; its ``Start`` method is invoked.
        model_ref_dict: Model reference placed in the start payload.
        session_id: Session ID to put in the request.
        selector_dict: Optional 'channel', 'library' and 'language' values.
            Only read, never modified.

    Returns:
        A ``(response, call)`` tuple: the response converted with
        MessageToDict, and the call object returned by ``with_call``.
    """
    start_req = StartRequest(
        session_id=session_id,
        selector=Selector(channel=selector_dict.get('channel'),
                          library=selector_dict.get('library'),
                          language=selector_dict.get('language')),
        payload=StartRequestPayload(model_ref=model_ref_dict),
    )
    log.debug(f'Start Request: {start_req}')

    reply, call = stub.Start.with_call(start_req)
    reply_dict = MessageToDict(reply)
    log.debug(f'Start Request Response: {reply_dict}')
    return reply_dict, call

def execute_request(stub, session_id, selector_dict={}, payload_dict={}):
    """Send an Execute request carrying text input and return the response.

    Args:
        stub: Dialog service stub; its ``Execute`` method is invoked.
        session_id: ID of the session the request belongs to.
        selector_dict: Optional 'channel', 'library' and 'language' values.
            Only read, never modified.
        payload_dict: Optional dictionary whose ``['user_input']['userText']``
            entry is the text to send. Only read, never modified. A missing
            'user_input' entry is treated as no text.

    Returns:
        A ``(response, call)`` tuple: the response converted with
        MessageToDict, and the call object returned by ``with_call``.
    """
    selector = Selector(channel=selector_dict.get('channel'),
                        library=selector_dict.get('library'),
                        language=selector_dict.get('language'))
    # Fall back to an empty dict so a payload without 'user_input' (including
    # the default payload) sends no text instead of failing on None.get().
    user_input_dict = payload_dict.get('user_input') or {}
    user_input = UserInput(user_text=user_input_dict.get('userText'))
    execute_payload = ExecuteRequestPayload(user_input=user_input)
    execute_req = ExecuteRequest(session_id=session_id,
                                 selector=selector,
                                 payload=execute_payload)
    # Log the whole request, as the Start and Stop helpers do.
    log.debug(f'Execute Request: {execute_req}')
    execute_response, call = stub.Execute.with_call(execute_req)
    response = MessageToDict(execute_response)
    log.debug(f'Execute Response: {response}')
    return response, call


def execute_stream_request(args, stub, session_id, selector_dict={}, initial = False, interpret_text = False, request_asr = False, request_tts = False):
    """Run one ExecuteStream exchange and collect what the service sends back.

    The outgoing stream is produced by build_stream_input() from the same
    arguments.

    Args:
        args: Parsed command line arguments, forwarded to build_stream_input().
        stub: Dialog service stub; its ``ExecuteStream`` method is invoked.
        session_id: ID of the session the request belongs to.
        selector_dict: Optional 'channel', 'library' and 'language' values.
        initial: Forwarded to build_stream_input().
        interpret_text: Forwarded to build_stream_input().
        request_asr: Forwarded to build_stream_input().
        request_tts: Forwarded to build_stream_input().

    Returns:
        A ``(responses, audio)`` tuple: the list of execute responses, one
        entry per response received, and a bytearray with all received TTS
        audio concatenated in arrival order.
    """
    # Receive stream outputs from Dialog, using stream of inputs
    stream_outputs = stub.ExecuteStream(build_stream_input(args, session_id, selector_dict, initial, interpret_text, request_asr, request_tts))
    responses = []
    audio = bytearray()

    for stream_output in stream_outputs:
        if stream_output.HasField("response"):
            response = stream_output.response
            # Each response is recorded exactly once.
            responses.append(response)
            log.debug(f'Received Execute response: {MessageToDict(response)}')
        if stream_output.HasField('audio') and stream_output.audio.HasField('audio'):
            log.debug("Received TTS audio: %d bytes" % len(stream_output.audio.audio))
            audio += stream_output.audio.audio
        if stream_output.HasField("asr_status"):
            asr_status = stream_output.asr_status
            log.debug("Received ASR status response: {} - {}".format(asr_status.code, asr_status.message))
        # ASR results are intentionally not logged here.

    return responses, audio

def build_stream_input(args, session_id, selector_dict, initial = False, interpret_text = False, request_asr = False, request_tts = False):
    """Generate the StreamInput messages for one ExecuteStream call.

    Without audio input a single StreamInput is produced, holding the execute
    request and the TTS settings. With audio input the file ``args.audioFile``
    is streamed in 0.02 second packets: the first StreamInput carries the
    execute request, the ASR settings, the TTS settings and the first packet,
    every following one carries audio only, and a final empty-audio
    StreamInput closes the stream. The generator sleeps for one packet
    duration after each audio packet.

    Args:
        args: Parsed command line arguments. Reads ``textInput`` and
            ``audioFile``.
        session_id: ID of the session the request belongs to.
        selector_dict: 'channel', 'library' and 'language' values.
        initial: When no text is interpreted, send a user input with empty
            text to kick off the conversation and get the initial prompts.
        interpret_text: Send ``args.textInput`` as the user text.
        request_asr: Stream the audio file for speech recognition.
        request_tts: Ask for generated speech in the response.

    Yields:
        StreamInput messages in the order they must be sent.
    """
    turn_selector = Selector(channel = selector_dict.get('channel'),
                             library = selector_dict.get('library'),
                             language = selector_dict.get('language'))

    # TTS settings stay None unless speech output was requested.
    tts_params = None
    if request_tts:
        # Generated speech is requested as 16 kHz PCM with the "Evan" enhanced voice.
        tts_format = nuance_tts_v1_pb2.AudioFormat(pcm = nuance_tts_v1_pb2.PCM(sample_rate_hz = 16000))
        tts_audio = nuance_tts_v1_pb2.AudioParameters(audio_format = tts_format)
        tts_voice = nuance_tts_v1_pb2.Voice(name = "Evan", model = "enhanced")
        tts_params = TtsParamsV1(audio_params = tts_audio, voice = tts_voice)

    if interpret_text:
        # Text was provided for interpretation
        turn_payload = ExecuteRequestPayload(user_input = UserInput(user_text = args.textInput))
    elif initial:
        # Kick-off request: a user input is required, but its text stays empty
        turn_payload = ExecuteRequestPayload(user_input = UserInput(user_text = None))
    else:
        # Audio input case: the payload itself stays empty
        turn_payload = ExecuteRequestPayload(user_input = None)

    request_header = ExecuteRequest(session_id = session_id,
                                    selector = turn_selector,
                                    payload = turn_payload)

    # No audio file: a single input is enough, either to kick off the dialog
    # or for a text turn, optionally with TTS.
    if not request_asr:
        single_input = StreamInput(request = request_header,
                                   tts_control_v1 = tts_params)
        log.debug(f'Stream input with parameters for TTS: {single_input.tts_control_v1}')
        yield single_input
        return

    # Audio file provided: open it, break it into packets, and stream them.
    with wave.open(args.audioFile, mode='r') as wav_reader:
        rate_hz = wav_reader.getframerate()
        # Each full packet covers 0.02 seconds of audio.
        seconds_per_packet = 0.020
        frames_per_packet = int(rate_hz * seconds_per_packet)
        asr_format = recognizer_pb2.AudioFormat(pcm = recognizer_pb2.PCM(sample_rate_hz = rate_hz))
        asr_params = AsrParamsV1(audio_format = asr_format, end_stream_no_valid_hypotheses = True)

        log.debug(f'Streaming audio input...')
        header_sent = False
        while True:
            chunk = wav_reader.readframes(frames_per_packet)
            # An empty read means the file is exhausted.
            if chunk == b'':
                break
            if header_sent:
                outgoing = StreamInput(audio = chunk)
                log.debug("Sending subsequent speech audio packet. Sending %d bytes." % len(chunk))
            else:
                # Only the first input carries the request and the ASR/TTS settings.
                header_sent = True
                outgoing = StreamInput(request = request_header,
                                       asr_control_v1 = asr_params,
                                       audio = chunk,
                                       tts_control_v1 = tts_params)
                log.debug(f'First streamed packet:')
                log.debug(f'Sending parameters for ASR: {outgoing.asr_control_v1}')
                log.debug(f'Sending parameters TTS: {outgoing.tts_control_v1}')
                log.debug("Sending first speech input audio packet. Sending %d bytes" % len(chunk))
            yield outgoing
            sleep(seconds_per_packet)

        # A final empty input tells Dialog that the audio stream is complete.
        end_of_audio = StreamInput(audio = b'')
        log.debug(f'Sending empty stream input to signal end of stream.')
        yield end_of_audio
    
def stop_request(stub, session_id=None):
    """Send a Stop request for a session and return the response.

    Args:
        stub: Dialog service stub; its ``Stop`` method is invoked.
        session_id: ID of the session named in the request.

    Returns:
        A ``(response, call)`` tuple: the response converted with
        MessageToDict, and the call object returned by ``with_call``.
    """
    outgoing = StopRequest(session_id=session_id)
    log.debug(f'Stop Request: {outgoing}')

    reply, call = stub.Stop.with_call(outgoing)
    reply_dict = MessageToDict(reply)
    log.debug(f'Stop Response: {reply_dict}')
    return reply_dict, call

def _require_ok(call, rpc_name):
    """Raise RuntimeError unless the finished call reports StatusCode.OK."""
    status = call.code()
    if status != grpc.StatusCode.OK:
        raise RuntimeError(f'{rpc_name} call finished with status {status}')


def _store_tts_audio(audio, file_name, description):
    """Save received TTS audio as 16 kHz, 16-bit, mono PCM wav, or log that none arrived."""
    if audio:
        save_audio_file_wav(audio, file_name, 16000, 16, 1, 1)
    else:
        log.debug(f'Something did not work with TTS {description}')


def main():
    """Run one sample conversation against the Dialog service.

    Parses the command line, sets up logging, opens the channel and starts a
    session. It then sends an initial request without user input followed by
    the main request, and finally stops the session. The streaming API is used
    when an audio file or TTS was requested, the unary Execute call otherwise.
    """
    parse_args()
    global log
    log = logging.getLogger('')
    logging.basicConfig(
        format='%(asctime)s %(levelname)-5s: %(message)s', level=logging.DEBUG)

    if args.oauthURL:
        if args.clientID is None:
            log.error("OAuth 2.0 URL was supplied but client ID is missing")
            return
        elif args.clientSecret is None:
            log.error("OAuth 2.0 URL was supplied but client secret is missing")
            return

    model_ref_dict = {
        "uri": args.modelUrn,
        "type": 0
    }
    selector_dict = {
        "channel": "default",
        "language": "en-US",
        "library": "default"
    }

    # Create channel to Dialog service
    with create_channel(args) as channel:
        stub = DialogServiceStub(channel)

        # Start the Dialog session
        response, call = start_request(stub,
                                       model_ref_dict=model_ref_dict,
                                       session_id=None,
                                       selector_dict=selector_dict)
        session_id = read_session_id_from_response(response)
        log.debug(f'Session: {session_id}')
        _require_ok(call, 'Start')

        try:
            log.debug(f'Initial request, no input from the user to get initial prompt')

            # Streaming required for ASR, TTS, or both
            if args.audioFile or args.tts:
                request_tts = args.tts
                # Speech input takes precedence over text input.
                request_asr = bool(args.audioFile)
                interpret_text = not request_asr

                # need to send initial request to kick off
                _, audio = execute_stream_request(args, stub, session_id, selector_dict = selector_dict, initial = True, request_tts = request_tts)
                # Audio is only expected, saved or reported as missing when
                # TTS was actually requested.
                if request_tts:
                    _store_tts_audio(audio, "initial_tts_audio.wav", "initial prompts")

                # then send main request
                _, audio = execute_stream_request(args, stub, session_id, selector_dict = selector_dict, interpret_text = interpret_text, request_asr = request_asr, request_tts = request_tts)
                if request_tts:
                    _store_tts_audio(audio, "main_tts_audio.wav", "main response")

            # No streaming required
            else:
                response, call = execute_request(stub,
                                                 session_id=session_id,
                                                 selector_dict=selector_dict,
                                                 payload_dict={"user_input": {"userText": None}})
                _require_ok(call, 'Execute')

                log.debug(f'Second request, passing in user input')
                response, call = execute_request(stub,
                                                 session_id=session_id,
                                                 selector_dict=selector_dict,
                                                 payload_dict={"user_input": {"userText": args.textInput}})
                _require_ok(call, 'Execute')
        finally:
            # Stop the session even when a request failed. A failing Stop is
            # only logged so it cannot hide the original error.
            try:
                stop_request(stub, session_id=session_id)
            except grpc.RpcError as err:
                log.warning("Failed to stop session %s: %s", session_id, err)

# Run the client only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()

