# * /************************************************************/
# * Author:
# * Major:
# * Creation Date: August 26, 2023
# * Due Date: Thursday, September 28, 11:59 PM via "make turnitin"
# * Course:
# * Professor Name:
# * Assignment: 1
# * Filename:
# * Purpose: Learn to use Python regular expressions.
# * /************************************************************/

# STUDENT 1: Complete the above template 1% of assignment.
# STUDENT: Leave all "STUDENT" comments intact in this file,
# and follow all STUDENT instructions. Do NOT change my other code!!!
# I will deduct at least 10% for each bug I fix, more if the N% value
# below is greater than 10%.

# https://pythex.org/ is useful for interactive testing.
# https://docs.python.org/3/library/sys.html
# https://docs.python.org/3/library/os.html
# https://docs.python.org/3/library/csv.html
# https://docs.python.org/3/library/gzip.html
# https://docs.python.org/3/library/re.html
# STUDENT BACKGROUND, not really needed for assignment:
# https://notes.shichao.io/tcpv1/ch3/   For link layer (e.g., Ethernet)
# https://notes.shichao.io/tcpv1/ch5/   For IP layer
# https://notes.shichao.io/tcpv1/ch10/  For UDP within IP datagram
# https://notes.shichao.io/tcpv1/ch12/  For TCP intro and header format

import sys      # Used to read command-line arguments in sys.argv
import os       # Used to test to avoid over-writing an output file.
import csv      # csv used to read & write comma-separated data files
import gzip     # gzip used to read a GNU compressed input file
import re       # used to match regular expressions to input lines

# START OF PARSON DEMO RE PATTERNS ***************************
# STUDENT: I am supplying example code to parse these lines in the input.
# I am writing a CSV file for Ethernet link layer to sys.stdout CSV file
# as a demo of how to go about this. You will do something similar with
# IP and (TCP or UDP) datagram lines.
# FROM THE INPUT FILE FORMAT, leading and trailing whitespace stripped.
# Every pattern below is anchored with ^ and is applied with .match()
# to a line that main() has already stripped.

# No. Time Source Destination Protocol Length Info
# The heading line carries no data; it only announces that the next
# line is the "No. Time ..." summary line for a frame.
Heading_pattern_string = \
    r'^No\.\s+Time\s+Source\s+Destination\s+Protocol\s+Length'
Heading_pattern = re.compile(Heading_pattern_string)

# 388 2.230401 172.16.42.4 cc-api-storage.adobe.io TLSv1.2
# capturing the frame number and time as a data fields,
# this line occurs *immediately* after Heading_pattern in the input.
# Groups: 1 = frame number, 2 = time stamp, 3 = protocol name.
# The source and destination columns are matched but not captured.
Time_pattern_string = r'^(\d+)\s+(\d+\.\d+)\s+\S+\s+\S+\s+(\S+)'
Time_pattern = re.compile(Time_pattern_string)

# Frame 388: 97 bytes on wire ...
# capturing/comparing the frame number and bytes as data fields.
# Groups: 1 = frame number, 2 = byte count.
Frame_pattern_string = r'^Frame\s+(\d+):\s+(\d+)\s+bytes'
Frame_pattern = re.compile(Frame_pattern_string)

# Ethernet II, Src: ADDR1A (ADDR1B), Dst: ADDR2A (ADDR2B)
# where ADDR1A and ADDR2A *may be* symbolic, and the B's are like this:
# Ethernet II, Src: Apple_e1:fa:60 (3c:15:c2:e1:fa:60), Dst: 92:55:48:24:40:70 (92:55:48:24:40:70)
# Note - values inside () are numeric, preceded by their symbolic values.
# We need to record and report both, even when they are the same.
# Groups: 1 = symbolic source, 2 = numeric source,
#         3 = symbolic destination, 4 = numeric destination.
Ethernet_pattern_string = \
    r'^Ethernet II, Src:\s+(\S+)\s+\(([^)]+)\),\s+Dst:\s+(\S+)\s+\(([^)]+)\)'
Ethernet_pattern = re.compile(Ethernet_pattern_string)
# END OF PARSON DEMO RE PATTERNS ***************************

# STUDENT 2: Create your patterns for these example lines here:
# Worth 30% of this project. Do not hard code address numbers & strings.
# See how I handle Ethernet addresses like Apple_e1:fa:60 or 3c:15:c2:e1:fa:60
# Internet Protocol Version 4, Src: 172.16.42.4 (172.16.42.4), Dst: cc-api-storage.adobe.io (52.206.174.72)
# Internet Protocol Version 6, Src: fe80::486:2ea7:8814:e9d7 (fe80::486:2ea7:8814:e9d7), Dst: ff02::fb (ff02::fb)
# Note - Match both Version 4 and 6. A datagram uses one or the other.
# Note - values inside () are numeric, preceded by their symbolic values.
# You need to record and report both, even when they are the same.
# Transmission Control Protocol, Src Port: 49260 (49260), Dst Port: https (443), Seq: 1949, Ack: 641, Len: 31
# We use symbolic ports and (numeric ports) as extracted data.
# We use Seq and Len numbers as data, skipping anything between them.
# User Datagram Protocol, Src Port: mdns (5353), Dst Port: mdns (5353)
# We use symbolic ports and (numeric ports) as extracted data.
# There are no Seq and Len numbers in these UDP lines, make them '' in output.


def main(infileName, datagramOutName, tcpStreamOutName):
    '''
    main opens infileName as saved as a condensed text file from Wireshark
    and writes datagramOutName as a CSV output file. datagramOutName has
    one CSV line ("instance" or "record") per UDP or TCP datagram
    (1 per underlying Ethernet frame). tcpStreamOutName is a second
    CSV output file that tracks an entire TCP stream (connection)
    from the first to the final datagram.
    main() assumes caller has validated the file names.

    Parameters:
        infileName: input path; a name ending in '.txt' is opened as
            plain text, anything else is opened with gzip.
        datagramOutName: path of the per-datagram CSV output file.
        tcpStreamOutName: path of the per-TCP-stream CSV output file.

    Side effects: writes the demo Ethernet CSV (header plus one row per
    'Ethernet II' line) to sys.stdout, writes warnings to sys.stderr,
    closes sys.stdout, and finishes by calling sys.exit(0), so it
    raises SystemExit instead of returning.
    '''
    infile = None
    if infileName.endswith('.txt'):
        infile = open(infileName, 'r')
    else:
        infile = gzip.open(infileName, 'rb')    # binary .gz file
    demoEthernetWriter = csv.writer(sys.stdout, delimiter=',', quotechar='"')
    Ethfieldnames = ['inlinenum', 'frameNumber', 'timeStamp', 'frameBytes',
                     'frameProtocol', 'symsrc', 'numsrc', 'symdst', 'numdst']
    demoEthernetWriter.writerow(Ethfieldnames)
    # sym for symbolic (when known), num for numeric.
    # STUDENT 3 10%: Open datagramOutName and tcpStreamOutName as CSV output
    # files similarly to above, 10% of this project. SEE:
    # ~parson/DataMine/TCPUDPout.csv.ref
    # ~parson/DataMine/TCPStreams.csv.ref
    justSawHeading = False
    frameNumber = None
    timeStamp = None
    frameBytes = None
    frameProtocol = None
    # The with-block closes the input file when parsing finishes or
    # raises; the original code never closed it.
    with infile:
        inline = infile.readline()
        lineno = 0
        while inline:
            lineno += 1
            if isinstance(inline, bytes):
                # convert byte sequence from a gzipped file into string
                inline = inline.decode()
            inline = inline.strip()
            try:
                matchobj = Heading_pattern.match(inline)
                if matchobj:
                    # By handling the next match and then continuing back
                    # up to "while inline:" within a "try:", the "finally:"
                    # reads the next line for all match cases.
                    justSawHeading = True
                    # print("DEBUG Heading_pattern line: " + str(lineno))
                    continue
                matchobj = Time_pattern.match(inline)
                if matchobj:
                    if not justSawHeading:
                        sys.stderr.write("WARNING, Frame No. Time line "
                                         + str(lineno) + ":\n" + inline
                                         + "\nnot preceded by heading line.\n\n")
                    justSawHeading = False  # Frame No. Time line is most recent
                    frameNumber = int(matchobj.group(1))
                    timeStamp = float(matchobj.group(2))
                    frameProtocol = matchobj.group(3)
                    # print("DEBUG Time_pattern line:",lineno,frameNumber, timeStamp)
                    continue
                justSawHeading = False  # Something else.
                matchobj = Frame_pattern.match(inline)
                if matchobj:
                    tmpframenum = int(matchobj.group(1))
                    frameBytes = int(matchobj.group(2))
                    if tmpframenum != frameNumber:
                        # Report the preceding frame NUMBER here; the
                        # original printed frameBytes, which contradicted
                        # the message text.
                        sys.stderr.write("WARNING, Frame sequence number line "
                                         + str(lineno) + ":\n" + inline
                                         + "\ndoes not match preceding number: "
                                         + str(frameNumber) + '\n\n')
                        frameNumber = tmpframenum
                    # print("DEBUG Frame_pattern line:",lineno,frameNumber,frameBytes)
                    continue
                matchobj = Ethernet_pattern.match(inline)
                if matchobj:
                    # timeStamp is still None if no "No. Time" line has
                    # been seen yet; write an empty field rather than
                    # raising TypeError from "%.6f" % None.
                    if timeStamp is None:
                        stampText = ''
                    else:
                        stampText = "%.6f" % timeStamp  # 6 decimal places
                    demoEthernetWriter.writerow([lineno, frameNumber,
                                                 stampText,
                                                 frameBytes, frameProtocol,
                                                 matchobj.group(1),
                                                 matchobj.group(2),
                                                 matchobj.group(3),
                                                 matchobj.group(4)])
                    # Keep frameNumber & timeStamp for IP and TCP|UDP output.
                    continue
                # STUDENT 4: 10% : Match 'Internet Protocol' lines and save the
                #       symbolic and numeric Src and Dst addresses in variables
                #       of your choice. You need to combine these data with TCP & UDP.
                # STUDENT 5: 14% : Match 'Transmission Control Protocol' lines
                #       & output an instance to your datagramOutName named CSV file,
                #       matching my reference file.
                # STUDENT 6: 14% : Aggregate TCP stream data in
                #       a Python dictionary (a.k.a. "map" or "hash table")
                #       as follows: Concatenate two key strings,
                #       SRCIP:SRCPORT and DSTIP:DSTPORT, then make two
                #       composite keys SRCIP:SRCPORT-DSTIP:DSTPORT and
                #       DSTIP:DSTPORT-SRCIP:SRCPORT. Check to see whether
                #       either of these two *composite keys* is a key in
                #       your dictionary DICT.
                #       IF NOT IN DICT.keys():
                #           Initialize an instance list per fields in
                #           ~parson/DataMine/TCPStreams.csv.ref, then
                #           store it in your DICT[SRCIP:SRCPORT-DSTIP:DSTPORT]
                #       ELSE
                #           IF SRCIP:SRCPORT-DSTIP:DSTPORT IN DICT.keys():
                #               Update fields per
                #               ~parson/DataMine/TCPStreams.csv.ref,
                #               This datagram is from the client.
                #           ELSE
                #               Update fields per
                #               ~parson/DataMine/TCPStreams.csv.ref,
                #               (The server is replying to this client, i.e.,
                #               DSTIP:DSTPORT-SRCIP:SRCPORT IN DICT.keys())
                #               This datagram is from the server.
                #       See ~parson/DataMine/TCPStreams.csv.ref
                #
                # STUDENT 7: 10% : Match 'User Datagram Protocol' lines and output
                #       an instance to your datagramOutName named CSV file,
                #       matching my reference file.
            finally:
                inline = infile.readline()
    # STUDENT 8 10%: After "while inline:" is done, do the folowing.
    #       Write the header row as seen in ~parson/DataMine/TCPStreams.csv.ref
    #       if you have not already. Then create a new sorted list of
    #       instances by calling sorted() on the values() of the TCP stream
    #       DICT. Then loop through this sorted list and WITHIN THE LOOP:
    #       Format the first and second fields at [0] and [1],
    #       which are the min and max timestamps, using the "%.6f" %
    #       formatting operation used above for my Ethernet frame output.
    #
    #       Then writerow this instance to your TCP Streams output per
    #       ~parson/DataMine/TCPStreams.csv.ref
    sys.stdout.close()
    # STUDENT 9: Close your output files, 1% of project.
    sys.exit(0)     # 0 means all is well


__USAGE__ = \
    "python3 csc523F23TCPUDP.py WiresharkInFile DatagramOutFile TCPstreamOutFile"

if __name__ == '__main__':
    # Being run as the main from command line or makefile, i.e., not imported
    if len(sys.argv) != 4:
        msg = "ERROR, USAGE: " + __USAGE__
        sys.stderr.write(msg + '\n')
        # raise ValueError(msg)
        sys.exit(1)     # Non-0 exit status for a Unix error.
    # Every failed check below reports its own error and leaves msg
    # non-empty, so all problems are listed before the single exit.
    msg = ""
    if os.path.exists(sys.argv[2]):
        msg = "ERROR: " + sys.argv[2] + " EXISTS, PLEASE REMOVE"
        sys.stderr.write(msg + '\n')
    if os.path.exists(sys.argv[3]):
        msg = "ERROR: " + sys.argv[3] + " EXISTS, PLEASE REMOVE"
        sys.stderr.write(msg + '\n')
    if not (sys.argv[1].endswith('.txt') or sys.argv[1].endswith('.gz')):
        msg = "ERROR, Input file must be a .txt or .gz text file: " \
            + sys.argv[1]
        sys.stderr.write(msg + '\n')
    if not (sys.argv[2].endswith('.csv') and sys.argv[3].endswith('.csv')):
        msg = "ERROR, Output files must be .csv text files: " \
            + sys.argv[2] + ',' + sys.argv[3]
        sys.stderr.write(msg + '\n')
    if sys.argv[2] == sys.argv[3]:
        msg = "ERROR, Output files must be .csv files with different names: " \
            + sys.argv[2] + ',' + sys.argv[3]
        sys.stderr.write(msg + '\n')
    if msg:
        sys.exit(1)
    main(sys.argv[1], sys.argv[2], sys.argv[3])