#!/usr/bin/python
import datetime
import getopt
import os
import subprocess
import sys
import tempfile
from stat import S_IREAD

import pandas as pd
from dateutil.relativedelta import relativedelta

try:
    from bs4 import BeautifulSoup
except ImportError:
    # bs4 is only needed by parse_mstrsen; keep the rest of the module usable
    # without it and report the missing package when that parser is called.
    BeautifulSoup = None

########################
## ARGUMENTS ##
# if not len(sys.argv) == 3:
#     print("Need Vacancy Seniority.pdf and Master Seniority.html files as inputs!")
#     sys.exit("Incorrect Arguments. " + sys.argv[0] + " aborted.")
# else:
#     VacancyFile = sys.argv[1]
#     MasterSenFile = sys.argv[2]
########################
## Script will attempt to read two Seniority Lists and merge them for data analysis
## Script will capture errors and throw exceptions
## Script will save result as a csv for archiving
## Script will delete original lists upon successful creation of merged csv
#######################

## LOGGING ##
verbosity = 2  # 0-silent, 1-debug, 2-verbose


def log(text, level=2):
    """Print *text* when *level* does not exceed the module ``verbosity``.

    Args:
        text: Message to print.
        level: Level of the message (1-debug, 2-verbose). Defaults to verbose.
    """
    if level <= verbosity:
        print(text)


######################
#######################
# Read PDF seniority list (domicile Seniority from SWALIFE)
# Formerly pdf.py

# Column order of the DataFrame returned by parse_vacsen.
_VACSEN_COLUMNS = ('ID', 'Fname', 'Lname', 'DomSen', 'Dom', 'Etops', 'Seat',
                   'MstrSen', 'vacsendate', 'combined')

# Lines starting with any of these are headers/footers, never pilot data.
_VACSEN_SKIP_PREFIXES = ("Base", "BG", "Sen", "Page", "Effective", "VACANCY",
                         "vacancy")


def _parse_vacsen_lines(lines):
    """Convert raw ``pdftotext`` output lines into a list of pilot dicts."""
    records = []
    domicile = seat = etops = combined = None  # set by the latest VACANCY header
    report_date = ''                           # set by the "Effective" line
    pending = None  # [DomSen, ID, Lname, Fname] waiting for its one-token line

    for line in lines:
        stripped = line.strip()

        if line.startswith("VACANCY"):
            # Flag entries from OAK CA / OAK CA E and OAK FO / OAK FO E lists
            # (duplicates).
            combined = "/" in line
            if stripped.endswith("E"):
                # ETOPS header ends in "<DOM> <SEAT> E". Slice the stripped
                # text so the result does not depend on the line terminator.
                domicile = stripped[-8:-5]
                seat = stripped[-4:-2]
                etops = 1
            else:
                # Regular header ends in "<DOM> <SEAT>".
                domicile = stripped[-6:-3]
                seat = stripped[-2:]
                etops = 0
        elif line.startswith("Effective"):
            # Strip the report generation date.
            report_date = stripped[16:23].strip()
        elif not line.startswith(_VACSEN_SKIP_PREFIXES):
            tokens = line.split()
            if len(tokens) > 1:
                # Name line: keep the first four fields, dropping odd initials
                # or **INTL designations.
                pending = tokens[:4]
                # Pad short rows (e.g. single-name pilots) so that later
                # fields never shift into the name slots.
                while len(pending) < 4:
                    pending.append("X")
            elif len(tokens) == 1 and tokens[0] != "E":
                # One-token line: completes the pending pilot (stored as
                # MstrSen).
                if pending is None or domicile is None:
                    # No name line or no VACANCY header seen yet, so there is
                    # nothing to attach this value to.
                    log("Skipping unexpected single-token line: " + stripped, 2)
                    continue
                dom_sen, emp_id, last_name, first_name = pending
                records.append({
                    'ID': emp_id,
                    'Fname': first_name,
                    'Lname': last_name,
                    'Dom': domicile,
                    'Seat': seat.strip(),
                    'Etops': etops,
                    'DomSen': dom_sen,
                    'MstrSen': tokens[0],
                    'vacsendate': report_date,
                    'combined': combined,
                })
                pending = None  # a stray token must not re-emit this pilot
    return records


def parse_vacsen(senfile):
    """Parse the vacancy (domicile) seniority PDF into a DataFrame.

    The PDF is converted to text with the external ``pdftotext -raw`` tool
    inside a temporary directory, which is removed afterwards even when
    parsing fails.

    Args:
        senfile: Path of the vacancy seniority PDF.

    Returns:
        pandas.DataFrame with one row per parsed pilot and the columns ID,
        Fname, Lname, DomSen, Dom, Etops, Seat, MstrSen, vacsendate and
        combined.

    Raises:
        FileNotFoundError: If ``senfile`` does not exist, or if the
            ``pdftotext`` executable cannot be found.
        subprocess.CalledProcessError: If ``pdftotext`` exits with an error.
    """
    log("Starting parse_vacsen with file " + str(senfile), 1)
    if not os.path.exists(senfile):
        raise FileNotFoundError("File " + str(senfile) + " not found")

    with tempfile.TemporaryDirectory() as workdir:
        txt_path = os.path.join(workdir, "vacsen.txt")
        log("Executing pdftotext on " + str(senfile), 2)
        # Argument list (no shell) so paths with spaces or shell
        # metacharacters are safe; check=True surfaces conversion failures
        # instead of parsing stale or missing output.
        subprocess.run(
            ["pdftotext", "-raw", "-enc", "UTF-8", str(senfile), txt_path],
            check=True,
        )
        log("COMPLETE!", 2)

        log("Opening " + txt_path + " and parsing pilot records", 2)
        with open(txt_path, "r", encoding="utf-8") as txt_file:
            records = _parse_vacsen_lines(txt_file)
        log("COMPLETE!", 2)
        log("Removing " + txt_path, 2)

    # DataFrame.append was removed in pandas 2.0; build the frame directly.
    df_pilots = pd.DataFrame(records, columns=_VACSEN_COLUMNS)
    log("parse_vacsen success", 1)
    return df_pilots


###################################
###################################
# Open html Seniority list from CWA and parse
# Formerly sen.py

# Column order of the DataFrame returned by parse_mstrsen.
_MSTRSEN_COLUMNS = ('ID', 'Name', 'TotalSen', 'DOH', 'CAdt', 'DOB')


def _parse_senlist_date(text):
    """Parse a ``DDMonYY`` date, moving future results back 100 years."""
    # Parsed dates from senlist have 2 digit years, so strptime may pick the
    # wrong century.
    parsed = datetime.datetime.strptime(text.strip(), '%d%b%y')
    if parsed > datetime.datetime.now():
        # A date later than today is treated as the previous century.
        parsed = parsed - relativedelta(years=100)
    return parsed


def parse_mstrsen(mstrsenfile):
    """Parse the master seniority HTML list into a DataFrame.

    Every ``<tr>`` of every ``<table class="detailListTable">`` is read. Only
    ``<td>`` cells with class ``centerAlign`` or ``leftAlign`` are considered
    and are mapped by position: 0 TotalSen, 1 Name, 3 ID, 4 DOH, 5 CAdt
    (``-`` means no value), 6 DOB. Rows without an ID are ignored.

    Args:
        mstrsenfile: Path of the master seniority HTML file.

    Returns:
        pandas.DataFrame with the columns ID, Name, TotalSen, DOH, CAdt, DOB.

    Raises:
        ImportError: If the ``bs4`` package is not installed.
        ValueError: If a date cell does not match the ``DDMonYY`` format.
    """
    log("Starting parse_mstrsen with file: " + str(mstrsenfile), 1)
    if BeautifulSoup is None:
        raise ImportError(
            "parse_mstrsen requires the 'bs4' (BeautifulSoup) package")

    with open(mstrsenfile) as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')

    records = []
    # Iterate through each individual table with class detailListTable.
    for table in soup.find_all('table', attrs={'class': 'detailListTable'}):
        for row in table.find_all('tr'):
            record = {"ID": '', "Name": '', "TotalSen": '', "DOH": '',
                      "CAdt": '', "DOB": ''}
            cells = row.find_all('td', class_=['centerAlign', 'leftAlign'])
            for position, cell in enumerate(cells):
                # Strip every cell so stray whitespace can neither break date
                # parsing nor the later merge on ID.
                text = str(cell.text).strip()
                if position == 0:
                    record["TotalSen"] = text
                elif position == 1:
                    record["Name"] = text
                elif position == 3:
                    record["ID"] = text
                elif position == 4:
                    record["DOH"] = _parse_senlist_date(text)
                elif position == 5 and text != '-':
                    record["CAdt"] = _parse_senlist_date(text)
                elif position == 6:
                    record["DOB"] = _parse_senlist_date(text)
            if record['ID']:  # Ignore blank rows
                records.append(record)

    # DataFrame.append was removed in pandas 2.0; build the frame directly.
    df_pilots = pd.DataFrame(records, columns=_MSTRSEN_COLUMNS)
    log("parse_mstrsen SUCCESS", 1)
    return df_pilots


###########################################################
##########################################################
### Merge csv files into a new file, and delete pre-merged files
#Formerly
# merge.py
def mergesen(df_domsen, df_vacsen, build_csv=True):
    """Inner-join two seniority DataFrames on ``ID`` and optionally archive them.

    Duplicate IDs are collapsed to their first occurrence. When ``build_csv``
    is true the result is written to ``<vacsendate>_merged_senlist.csv`` in
    the current directory, where ``vacsendate`` comes from the first merged
    row, and the file is then made read-only on a best-effort basis.

    Args:
        df_domsen: First DataFrame; must contain an ``ID`` column.
        df_vacsen: Second DataFrame; must contain an ``ID`` column. One of
            the two frames must supply ``vacsendate`` when ``build_csv`` is
            true.
        build_csv: Write the merged result to disk when true (the default).

    Returns:
        The merged, de-duplicated pandas.DataFrame.

    Raises:
        IndexError: If ``build_csv`` is true and the merge produced no rows,
            so no report date is available to name the CSV.
    """
    log("Starting mergesen", 1)
    df_merged = pd.merge(df_domsen, df_vacsen, on='ID', how='inner')
    # Filter for rows flagged as coming from a combined list (left disabled):
    # df_merged = df_merged[df_merged.combined != True]
    df_merged = df_merged.drop_duplicates(subset='ID', keep='first')

    if build_csv:
        if df_merged.empty:
            # Fail loudly rather than skip the archive; say why.
            raise IndexError(
                "merged seniority list is empty; cannot determine the report "
                "date for the CSV file name")
        # keep='first' preserves row 0, so this is the same report date the
        # un-deduplicated merge would give. str() tolerates non-string dates.
        reportdate = str(df_merged.iloc[0]['vacsendate'])
        filename = reportdate + '_merged_senlist.csv'

        log("Building merged.csv", 2)
        df_merged.to_csv(filename)
        try:
            # Read-only for owner and group; a CSV has no use for the execute
            # bits that the previous 0o550 mode also set.
            os.chmod(filename, 0o440)
            log("Successfully changed " + filename + " to read-only", 2)
        except OSError:
            # Best effort: the archive exists even if it stays writable.
            log("Could not modify permissions to read only for file " + filename, 2)

    log("mergesen SUCCESS!", 1)
    return df_merged