Below is the code I am using to generate the crunched numbers for Albany County, for example my 2021 Albany County Election Spreadsheet. Text is converted using pdftotext -layout 2021GeneralRecanvass.txt which converts the PDF to a structured text file for easier parsing. The Excel Writer library is used for extensive formatting of the file for easy reading and parsing. View this as a separate page for syntax highlighting.
001 | from io import StringIO |
015 | electionResultText = '/home/andy/Documents/GIS.Data/election.districts/2019' |
016 | enrollmentXLS = '/home/andy/enroll/nov19-Enrollment/AlbanyED_nov19.xlsx' |
017 | outputPath = '/tmp/2018_albany_county_races.xlsx' |
024 | return chr ( ord ( '@' ) + math.floor(colNum / 26 )) + chr ( ord ( '@' ) + colNum % 26 ) |
026 | return chr ( ord ( '@' ) + colNum) |
028 | pd.set_option( "display.max_rows" , None ) |
029 | pd.set_option( "display.max_columns" , None ) |
031 | with open (electionResultText) as f: |
034 | enroll = pd.read_excel(enrollmentXLS, header = 4 ) |
035 | enroll = enroll[(enroll[ 'STATUS' ] = = 'Active' )][[ 'ELECTION DIST' , 'TOTAL' ]].dropna() |
036 | enroll[ 'ELECTION DIST' ] = enroll[ 'ELECTION DIST' ].astype( str ). str .replace( ' ' , ' ' ) |
037 | enroll[ 'ELECTION DIST' ] = enroll[ 'ELECTION DIST' ]. str .strip() |
038 | enroll[ 'Enrollment' ] = enroll[ 'TOTAL' ] |
039 | enroll.drop(labels = [ 'TOTAL' ], axis = 1 , inplace = True ) |
045 | for raceData in re.split( '={10,}' ,data): |
051 | rows = raceData.split( '\n' ) |
057 | for i, line in enumerate (rows): |
059 | if re.search( 'VOTES\s*?PERCENT' , line): |
062 | race = race.replace( ' ' , ' ' ) |
067 | for result in re.findall( '(\d\d)\s*?=\s*?(\w.*?)\d' , line): |
068 | candidates[ int (result[ 0 ])] = result[ 1 ].rstrip() |
071 | if re.findall( '-{5,}' , line): |
076 | if not candidates or not 2 in candidates: |
088 | sep = '(?<=\d)\s{1,}(?=\d)' , |
092 | df = df[df[ 0 ]. str .contains( '^\d{4}' ).fillna( False )] |
094 | df.reset_index(drop = True , inplace = True ) |
096 | df = df.rename(candidates,axis = 1 ) |
098 | df.iloc[:, 1 :] = df.iloc[:, 1 :]. apply (pd.to_numeric) |
099 | df.iloc[:, 1 :] = df.iloc[:, 1 :].convert_dtypes( 'int32' ) |
102 | df.insert( 1 , 'Ballot' , df.iloc[:, 1 :].convert_dtypes( 'int32' ). sum (axis = 1 )) |
104 | df.insert( 2 , 'Blanks' , df[ 'OVER VOTES' ].convert_dtypes( 'int32' )) |
105 | df.drop(labels = [ 'OVER VOTES' ], axis = 1 , inplace = True ) |
107 | df[ 'Blanks' ] + = df[ 'UNDER VOTES' ].convert_dtypes( 'int32' ) |
108 | df.drop(labels = [ 'UNDER VOTES' ], axis = 1 , inplace = True ) |
110 | df.insert( 3 , 'Canvas' , '=F' + (df.index + 2 ). map ( str ) + '-G' + (df.index + 2 ). map ( str )) |
111 | df.insert( 4 , 'TO %' , '=(F' + (df.index + 2 ). map ( str ) + '/E' + (df.index + 2 ). map ( str ) + ')' ) |
112 | df.insert( 5 , 'DO %' , '=(G' + (df.index + 2 ). map ( str ) + '/F' + (df.index + 2 ). map ( str ) + ')' ) |
115 | df[ 'CHECK' ] = '=F' + (df.index + 2 ). map ( str ) + '-G' + (df.index + 2 ). map ( str ) |
119 | for i, col in enumerate (df.columns[ 6 : - 2 ]): |
122 | if ( len (df.columns) - i - 2 ) > 0 : |
124 | df[ 'CHECK' ] + = '-' + letter(i + i + 11 ) + (df.index + 2 ). map ( str ) |
127 | df[ 'CHECK %' ] + = '+' + letter(i + i + 12 ) + (df.index + 2 ). map ( str ) |
130 | df.insert(i + i + 7 , col + ' %' , '=(' + letter(i + i + 11 ) + (df.index + 2 ). map ( str ) + '/H' + (df.index + 2 ). map ( str ) + ')' ) |
135 | df[[ 'ED Code' , 'Municipality' , 'String' , 'Ward' , 'ED' ]] = df[ 0 ]. str .extract( '(\d\d\d\d)\s*?(.*?)(\s*?WARD\s*(\d*))?\s*?ED\s*(\d*)' ) |
138 | df[ 'ELECTION DIST' ] = df[ 'Municipality' ]. str .strip().replace( ' ' , ' ' ) + \ |
139 | ' ' + df[ 'Ward' ].fillna( 0 ).astype( str ). str .zfill( 3 ) + df[ 'ED' ]. str .zfill( 3 ).astype( 'str' ) |
142 | df = df.merge(enroll, on = 'ELECTION DIST' , how = 'left' ) |
143 | df = df.drop( 0 , axis = 1 ) |
144 | df = df.drop(labels = [ 'ELECTION DIST' , 'String' ],axis = 1 ) |
147 | cols = list (df.columns) |
148 | df = df[cols[ - 5 :] + cols[: - 5 ]] |
150 | df[ 'Municipality' ] = df[ 'Municipality' ]. str .title() |
151 | df[ 'Municipality' ] = df[ 'Municipality' ]. str .strip().replace( ' ' , ' ' ) |
154 | er[race] = df.fillna( 0 ) |
157 | ew = pd.ExcelWriter(outputPath) |
159 | for i, race in enumerate (er): |
160 | raceStr = str (i + 1 ) + ' ' + race |
162 | raceStr = str (i + 1 ) + ' ' + race[: 10 ] + '...' + race[ - 10 :] |
165 | er[race].fillna( 0 ).to_excel(ew,sheet_name = raceStr, index = False ) |
167 | headForm = ew.book.add_format({ 'text_wrap' : 1 , 'font_family' : 'Arial' , 'bold' : True , |
168 | 'valign' : 'vcenter' , 'align' : 'center' , 'bg_color' : '#CCCCCC' }) |
169 | for colnum, value in enumerate (er[race].columns.values): |
170 | ew.sheets[raceStr].write( 0 , colnum, value, headForm) |
172 | ew.sheets[raceStr].set_row( 0 , 50 ) |
175 | bodyForm = ew.book.add_format({ 'text_wrap' : 1 , 'font_family' : 'Arial' , 'num_format' : '#,##0' , 'valign' : 'vcenter' , 'align' : 'right' }) |
176 | bodyPerForm = ew.book.add_format({ 'text_wrap' : 1 , 'font_family' : 'Arial' , 'num_format' : '0.0%' , 'valign' : 'vcenter' , 'align' : 'right' }) |
177 | muniForm = ew.book.add_format({ 'text_wrap' : 1 , 'font_family' : 'Arial' , 'valign' : 'vcenter' , 'align' : 'center' }) |
178 | ew.sheets[raceStr].set_column( 'A1:A9999' , 6 , bodyForm) |
179 | ew.sheets[raceStr].set_column( 'B1:B9999' , 15 , muniForm) |
180 | ew.sheets[raceStr].set_column( 'C1:D9999' , 4 , bodyForm) |
181 | ew.sheets[raceStr].set_column( 'E1:H9999' , 10 , bodyForm) |
182 | ew.sheets[raceStr].set_column( 'I1:J9999' , 10 , bodyPerForm) |
184 | for colNum in range ( 11 , 40 ): |
186 | ew.sheets[raceStr].set_column(letter(colNum) + '1:' + letter(colNum) + '9999' , 12 , bodyForm) |
188 | ew.sheets[raceStr].set_column(letter(colNum) + '1:' + letter(colNum) + '9999' , 12 , bodyPerForm) |
190 | ew.sheets[raceStr].freeze_panes( 'E2' ) |