# * /************************************************************/
# * Author:             
# * Major:              
# * Creation Date:      August 26, 2023
# * Due Date:           Thursday, September 28, 11:59 PM via "make turnitin"
# * Course:             
# * Professor Name:     
# * Assignment:         1
# * Filename:           
# * Purpose:            Learn to use Python regular expressions.
# * /************************************************************/
# STUDENT 1: Complete the above template 1% of assignment.

# STUDENT: Leave all "STUDENT" comments intact in this file,
# and follow all STUDENT instructions. Do NOT change my other code!!!
# I will deduct at least 10% for each bug I fix, more if the N% value
# below is greater than 10%.

# https://pythex.org/ is useful for interactive testing.
# https://docs.python.org/3/library/sys.html
# https://docs.python.org/3/library/os.html
# https://docs.python.org/3/library/csv.html
# https://docs.python.org/3/library/gzip.html
# https://docs.python.org/3/library/re.html
# STUDENT BACKGROUND, not really needed for assignment:
# https://notes.shichao.io/tcpv1/ch3/       For link layer (e.g., Ethernet)
# https://notes.shichao.io/tcpv1/ch5/       For IP layer
# https://notes.shichao.io/tcpv1/ch10/      For UDP within IP datagram
# https://notes.shichao.io/tcpv1/ch12/      For TCP intro and header format

import sys              # Used to read command-line arguments in sys.argv
import os               # Used to test to avoid over-writing an output file.
import csv              # csv used to read & write comma-separated data files
import gzip             # gzip used to read a GNU compressed input file
import re               # used to match regular expressions to input lines

# START OF PARSON DEMO RE PATTERNS ***************************
# STUDENT: I am supplying example code to parse these lines in the input.
# I am writing the Ethernet link-layer data in CSV format to sys.stdout
# as a demo of how to go about this. You will do something similar with
# IP and (TCP or UDP) datagram lines.

# FROM THE INPUT FILE FORMAT, leading and trailing whitespace stripped.

# No.     Time           Source                Destination           Protocol Length Info
# Column-heading line that introduces every frame in the capture text.
Heading_pattern_string = (
    r'^No\.\s+Time\s+Source\s+Destination\s+Protocol\s+Length'
)
Heading_pattern = re.compile(Heading_pattern_string)

# Summary line expected *immediately* after the heading line, e.g.:
# 388 2.230401       172.16.42.4           cc-api-storage.adobe.io TLSv1.2
#   group 1 = frame number, group 2 = time stamp, group 3 = protocol;
#   the source and destination columns are matched but not captured.
Time_pattern_string = (
    r'^(\d+)\s+(\d+\.\d+)\s+\S+\s+\S+\s+(\S+)'
)
Time_pattern = re.compile(Time_pattern_string)

# Frame detail line, e.g.:
# Frame 388: 97 bytes on wire ...
#   group 1 = frame number (compared against the summary line),
#   group 2 = number of bytes in the frame.
Frame_pattern_string = (
    r'^Frame\s+(\d+):\s+(\d+)\s+bytes'
)
Frame_pattern = re.compile(Frame_pattern_string)

# Ethernet link-layer line of the form
# Ethernet II, Src: ADDR1A (ADDR1B), Dst: ADDR2A (ADDR2B)
#   ADDR1A and ADDR2A *may be* symbolic; the parenthesized B values are
#   numeric, for example:
# Ethernet II, Src: Apple_e1:fa:60 (3c:15:c2:e1:fa:60), Dst: 92:55:48:24:40:70 (92:55:48:24:40:70)
#   group 1 / group 2 = symbolic / numeric source address,
#   group 3 / group 4 = symbolic / numeric destination address.
#   Both forms are recorded and reported, even when they are the same.
Ethernet_pattern_string = (
    r'^Ethernet II, Src:\s+(\S+)\s+\(([^)]+)\),\s+Dst:\s+(\S+)\s+\(([^)]+)\)'
)
Ethernet_pattern = re.compile(Ethernet_pattern_string)
# END OF PARSON DEMO RE PATTERNS ***************************

# STUDENT 2: Create your patterns for these example lines here:
# Worth 30% of this project. Do not hard code address numbers & strings.
# See how I handle Ethernet addresses like Apple_e1:fa:60 or 3c:15:c2:e1:fa:60

# Internet Protocol Version 4, Src: 172.16.42.4 (172.16.42.4), Dst: cc-api-storage.adobe.io (52.206.174.72)
# Internet Protocol Version 6, Src: fe80::486:2ea7:8814:e9d7 (fe80::486:2ea7:8814:e9d7), Dst: ff02::fb (ff02::fb)
#   Note - Match both Version 4 and 6. A datagram uses one or the other.
#   Note - values inside () are numeric, preceded by their symbolic values.
#   You need to record and report both, even when they are the same.

# Transmission Control Protocol, Src Port: 49260 (49260), Dst Port: https (443), Seq: 1949, Ack: 641, Len: 31
# We use symbolic ports and (numeric ports) as extracted data.
# We use Seq and Len numbers as data, skipping anything between them.

# User Datagram Protocol, Src Port: mdns (5353), Dst Port: mdns (5353)
# We use symbolic ports and (numeric ports) as extracted data.
# There are no Seq and Len numbers in these UDP lines, make them '' in output.

def main(infileName, datagramOutName, tcpStreamOutName):
    '''
    main opens infileName as saved as a condensed text file from Wireshark
    and writes datagramOutName as a CSV output file.
    datagramOutName has one CSV line ("instance" or "record") per
    UDP or TCP datagram (1 per underlying Ethernet frame).
    tcpStreamOutName is a second CSV output file that tracks an
    entire TCP stream (connection) from the first to the final datagram.
    main() assumes caller has validated the file names.

    Parameters:
        infileName -- input path; opened as plain text when it ends in
            '.txt', otherwise opened with gzip in binary mode and each
            line is decoded to str.
        datagramOutName -- path for the per-datagram CSV output file
            (opening and writing it is left to the STUDENT sections).
        tcpStreamOutName -- path for the TCP-stream CSV output file
            (opening and writing it is left to the STUDENT sections).

    Side effects:
        Writes one demo CSV row per matched Ethernet line to sys.stdout,
        writes WARNING messages to sys.stderr, closes sys.stdout, and
        terminates by calling sys.exit(0) (i.e., raises SystemExit).
    '''
    if infileName.endswith('.txt'):
        infile = open(infileName, 'r')
    else:
        infile = gzip.open(infileName, 'rb')        # binary .gz file

    demoEthernetWriter = csv.writer(sys.stdout, delimiter=',', quotechar='"')
    Ethfieldnames = ['inlinenum', 'frameNumber', 'timeStamp', 'frameBytes',
        'frameProtocol', 'symsrc', 'numsrc', 'symdst', 'numdst']
    demoEthernetWriter.writerow(Ethfieldnames)
    # sym for symbolic (when known), num for numeric.

    # STUDENT 3 10%: Open datagramOutName and tcpStreamOutName as CSV output
    # files similarly to above, 10% of this project. SEE:
    # ~parson/DataMine/TCPUDPout.csv.ref
    # ~parson/DataMine/TCPStreams.csv.ref

    # State carried from one input line to the following lines of a frame.
    justSawHeading = False
    frameNumber = None
    timeStamp = None
    frameBytes = None
    frameProtocol = None

    inline = infile.readline()
    lineno = 0
    while inline:
        lineno += 1
        if isinstance(inline,bytes):
            # convert byte sequence from a gzipped file into string
            inline = inline.decode()
        inline = inline.strip()
        try:
            matchobj = Heading_pattern.match(inline)
            if matchobj:
                # By handling the next match and then continuing back
                # up to "while inline:" within a "try:", the "finally:"
                # reads the next line for all match cases.
                justSawHeading = True
                # print("DEBUG Heading_pattern line: " + str(lineno))
                continue
            matchobj = Time_pattern.match(inline)
            if matchobj:
                if not justSawHeading:
                    sys.stderr.write("WARNING, Frame No. Time line "
                        + str(lineno) + ":\n" + inline
                        + "\nnot preceded by heading line.\n\n")
                justSawHeading = False # Frame No. Time line is most recent
                frameNumber = int(matchobj.group(1))
                timeStamp = float(matchobj.group(2))
                frameProtocol = matchobj.group(3)
                # print("DEBUG Time_pattern line:",lineno,frameNumber, timeStamp)
                continue
            justSawHeading = False  # Something else.
            matchobj = Frame_pattern.match(inline)
            if matchobj:
                tmpframenum = int(matchobj.group(1))
                frameBytes = int(matchobj.group(2))
                if tmpframenum != frameNumber:
                    # Report the preceding frame number (not the byte
                    # count) so the warning shows the actual mismatch.
                    sys.stderr.write("WARNING, Frame sequence number line "
                        + str(lineno) + ":\n" + inline
                        + "\ndoes not match preceding number: "
                        + str(frameNumber) + '\n\n')
                    frameNumber = tmpframenum
                # print("DEBUG Frame_pattern line:",lineno,frameNumber,frameBytes)
                continue
            matchobj = Ethernet_pattern.match(inline)
            if matchobj:
                demoEthernetWriter.writerow([lineno, frameNumber,
                    "%.6f" % timeStamp, # format to 6 decimal places
                    frameBytes, frameProtocol, matchobj.group(1),
                    matchobj.group(2), matchobj.group(3), matchobj.group(4)])
                # Keep frameNumber & timeStamp for IP and TCP|UDP output.
                continue

            # STUDENT 4: 10% : Match 'Internet Protocol' lines and save the
            # symbolic and numeric Src and Dst addresses in variables
            # of your choice. You need to combine these data with TCP & UDP.

            # STUDENT 5: 14% : Match 'Transmission Control Protocol' lines
            # & output an instance to your datagramOutName named CSV file,
            # matching my reference file.

                # STUDENT 6: 14% : Aggregate TCP stream data in
                # a Python dictionary (a.k.a. "map" or "hash table")
                # as follows: Concatenate two key strings,
                # SRCIP:SRCPORT and DSTIP:DSTPORT, then make two
                # composite keys SRCIP:SRCPORT-DSTIP:DSTPORT and
                # DSTIP:DSTPORT-SRCIP:SRCPORT. Check to see whether
                # either of these two *composite keys* is a key in
                # your dictionary DICT.
                # IF NOT IN DICT.keys():
                #   Initialize an instance list per fields in
                #   ~parson/DataMine/TCPStreams.csv.ref, then
                #   store it in your DICT[SRCIP:SRCPORT-DSTIP:DSTPORT]
                # ELSE
                #       IF SRCIP:SRCPORT-DSTIP:DSTPORT IN DICT.keys():
                #           Update fields per
                #           ~parson/DataMine/TCPStreams.csv.ref,
                #           This datagram is from the client.
                #       ELSE
                #           Update fields per
                #           ~parson/DataMine/TCPStreams.csv.ref,
                #           (The server is replying to this client, i.e.,
                #           DSTIP:DSTPORT-SRCIP:SRCPORT IN DICT.keys())
                #           This datagram is from the server.
                #   See ~parson/DataMine/TCPStreams.csv.ref
                #

            # STUDENT 7: 10% : Match 'User Datagram Protocol' lines and output
            # an instance to your datagramOutName named CSV file,
            # matching my reference file.

        finally:
            inline = infile.readline()

    # All input has been consumed; release the input file handle.
    infile.close()

    # STUDENT 8 10%: After "while inline:" is done, do the following.
    # Write the header row as seen in ~parson/DataMine/TCPStreams.csv.ref
    # if you have not already. Then create a new sorted list of
    # instances by calling sorted() on the values() of the TCP stream
    # DICT. Then loop through this sorted list and WITHIN THE LOOP:
    #       Format the first and second fields at [0] and [1],
    #       which are the min and max timestamps, using the "%.6f" %
    #       formatting operation used above for my Ethernet frame output.
    #
    #       Then writerow this instance to your TCP Streams output per
    #       ~parson/DataMine/TCPStreams.csv.ref


    sys.stdout.close()

    # STUDENT 9: Close your output files, 1% of project.
    sys.exit(0) # 0 means all is well

__USAGE__ = (
    "python3 csc523F23TCPUDP.py WiresharkInFile DatagramOutFile TCPstreamOutFile"
)
if __name__ == '__main__':
    # Script entry point (not executed on import): validate the command
    # line, report every problem found on stderr, and only then call main().
    if len(sys.argv) != 4:
        sys.stderr.write("ERROR, USAGE: " + __USAGE__ + '\n')
        sys.exit(1)     # Non-0 exit status for a Unix error.

    inName, datagramName, streamName = sys.argv[1], sys.argv[2], sys.argv[3]
    problems = []       # one message per failed validation, in check order

    # Refuse to over-write either output file.
    for outName in (datagramName, streamName):
        if os.path.exists(outName):
            problems.append("ERROR: " + outName + " EXISTS, PLEASE REMOVE")

    # The input must be plain text or a gzipped text file.
    if not inName.endswith(('.txt', '.gz')):
        problems.append(
            "ERROR, Input file must be a .txt or .gz text file: " + inName)

    # Both outputs must be .csv files ...
    if not datagramName.endswith('.csv') or not streamName.endswith('.csv'):
        problems.append(
            "ERROR, Output files must be .csv text files: "
            + datagramName + ',' + streamName)

    # ... and they must be two distinct files.
    if datagramName == streamName:
        problems.append(
            "ERROR, Output files must be .csv files with different names: "
            + datagramName + ',' + streamName)

    for problem in problems:
        sys.stderr.write(problem + '\n')
    if problems:
        sys.exit(1)
    main(inName, datagramName, streamName)