PHP Script for Converting RPTL 1590 Reports into Excel Files

Real Property Tax Law 1590 requires that municipalities post their tax rolls, within 10 days of the proposed and final rolls being approved. Below is an PHP script that will extract the reports into a CSV file for importing into Microsoft Excel or a GIS program. It extracts the text from the PDF using pdftotext from the poppler-util.

If you do not want to install poppler-util, I would encourage to check out the simpler and better maintained R Script for for Converting RPTL 1590 Reports that I also wrote. Both versions can also be found on my GitHub.

001<?php
002 
003// this program requires pdftotext (a linux program) and PHP version 7.2
004 
005// first convert PDF to text
006$pdfdir = "input-pdf";
007$textdir = "output-txt";
008 
009// delete old input-text
010if (isset($argv[1]) && $argv[1] == 'delete') {
011    echo "Deleting old conversions ...\n";
012    system("rm $textdir/*");
013}
014 
015foreach (scandir($pdfdir) as $file) {
016    if (substr($file, -4) !== '.pdf') {
017        continue;
018    }
019     
020    $textfile = substr($file, 0, -4).".txt";
021    $town = substr($file, 0, -4);
022     
023    echo ("#### START $town #### \n");
024     
025    if (file_exists("$textdir/$textfile")) {
026        echo "Text file exists, not converting PDF again (arg[1] == delete to override).\n";
027    }
028    else {
029        echo "Converting to text file ...";
030        system('pdftotext -layout '.escapeshellarg("$pdfdir/$file").' '.escapeshellarg("$textdir/$textfile"));
031        echo " DONE\n";
032    }
033     
034    $text = file("$textdir/$textfile");
035    $town = substr($file, 0, -4);
036 
037    $taxroll = array();
038    $payerId = 0;
039     
040    $output = "";
041     
042    $townId = "";
043    $swisId = "";
044    $countyId = "";
045    $villageId = "";
046     
047    for ($i = 0; $i < count($text); $i++) {
048        if ($i % 100 == 0) echo "#";
049         
050        // capture county - town - swis
051        if (preg_match('/COUNTY\s*?- (.*?)\s{2}/', $text[$i], $matches)) $countyId = $matches[1];
052        if (preg_match('/CITY\s*?- (.*?)\s{2}/', $text[$i], $matches)) $townId = $matches[1];
053        if (preg_match('/TOWN\s*?- (.*?)\s{2}/', $text[$i], $matches)) $townId = $matches[1];
054        if (preg_match('/VILLAGE\s*?- (.*?)\s{2}/', $text[$i], $matches)) $villageId = $matches[1];
055        if (preg_match('/SWIS\s*?- (.*?)\s{2}/', $text[$i], $matches)) $swisId = $matches[1];
056 
057         
058        // first line = tax id
059        $pattern = '/\*{3,} ((\d|\-|\.){4,}) \*{3,}/';
060        preg_match($pattern, $text[$i], $matches);
061 
062        // we've found the start of a new tax record!
063        if (isset($matches[1])) {
064            $i++;
065             
066            $taxpayer = array();
067            $j = 0;
068            // output each part onto the line
069            while (isset($text[$i]) && !preg_match('/\*{3,}/', $text[$i])) {
070                $split = preg_split('/\s{2,}/', $text[$i]);
071                 
072                $taxpayer[$j] = $split;
073                $i++; $j++;
074            }
075             
076             
077            $taxpayer[$j] = array('location',$countyId, $townId, $villageId, $swisId);
078                         
079            $taxroll[$payerId++] = $taxpayer;
080            $i--;
081        }
082    }
083 
084    // export unprocess tax rolls for debug
085    file_put_contents("output-debug/$town.txt", print_r($taxroll,true));
086     
087    // next scan for all special district types in file
088    $specialDistType = array();
089     
090    foreach ($taxroll as $taxpayer) {
091        for ($i = 0; $i < count($taxpayer); $i++) {
092                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
093                    if (preg_match('/^([A-Z]{2})(\d\d\d) (.*?)( TO|$|\d{2,})/', $taxpayer[$i][$j],$matches)) {
094                        $specialDistType[$matches[1]] = $matches[1];                       
095                    }
096                }  
097        }
098    }
099    ksort($specialDistType);
100 
101    // then process into a nice field
102    $formTax = array();
103 
104    foreach ($taxroll as $taxpayer) {
105        $formPayer = array();
106         
107        $formPayer[0] = $taxpayer[1][0]; // tax id
108         
109        if (isset($taxpayer[0][1]) && preg_match('/^(\d.*?) (.*?)$/',$taxpayer[0][1], $address)) {
110            $formPayer[1] = $address[1]; // street number
111            $formPayer[2] = ucwords(strtolower($address[2])); // street name
112        }
113        elseif (isset($taxpayer[0][1]))  {
114            $formPayer[1] = '';
115            $formPayer[2] = ucwords(strtolower($taxpayer[0][1])); // street name
116        }
117         
118        if (isset($formPayer[1])) $formPayer[23] = ltrim($formPayer[1].' '.$formPayer[2]); // full street
119        else if (isset($formPayer[1])) $formPayer[23] = ltrim($formPayer[2]);
120         
121        $formPayer[3] = ucwords(strtolower($taxpayer[2][0])); // owner 1
122         
123        // next five lines are either are owner or address info
124        for ($i = 3; $i < 8; $i++) {
125             
126            if (!isset($taxpayer[$i][0])) continue;
127             
128            // if a taxpayer name
129            if (preg_match('/^[A-Z]/',$taxpayer[$i][0]) && !preg_match('/^PO/',$taxpayer[$i][0]) && !preg_match('/^(.*?), (\w\w) (.*?)$/',$taxpayer[$i][0]))    {
130                 
131                if (!isset($formPayer[4])) $formPayer[4] = ucwords(strtolower($taxpayer[$i][0]));
132                else if (!isset($formPayer[5])) $formPayer[5] = ucwords(strtolower($taxpayer[$i][0]));
133                else if (!isset($formPayer[6])) $formPayer[6] = ucwords(strtolower($taxpayer[$i][0]));
134            }
135             
136            // if a city - state - zip
137            else if (preg_match('/^(.*?), (\w\w) (.*?)$/',$taxpayer[$i][0], $address)) {
138                $formPayer[10] = ucwords(strtolower($address[1]));
139                $formPayer[11] = strtoupper($address[2]);
140                $formPayer[12] = ucwords(strtolower($address[3]));
141            }
142             
143            // if an address (pad to this field)
144            else if (preg_match('/^\d/',$taxpayer[$i][0]) || preg_match('/^PO/',$taxpayer[$i][0])) {
145                if (!isset($formPayer[7])) $formPayer[7] =  ucwords(strtolower($taxpayer[$i][0]));
146                else if (!isset($formPayer[8])) $formPayer[8] =  ucwords(strtolower($taxpayer[$i][0]));
147                else if (!isset($formPayer[9])) $formPayer[9] =  ucwords(strtolower($taxpayer[$i][0]));
148            }
149         
150        $formPayer[13] = $taxpayer[1][1];
151    }
152         
153        // extract coordinates by searching through array
154        for ($i = 0; $i < count($taxpayer); $i++) {
155            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
156                if (preg_match('/EAST-(\d*) NRTH-(\d*)/', $taxpayer[$i][$j], $coord)) {
157                    $formPayer[14] = $coord[1];
158                    $formPayer[15] = $coord[2];    
159                }
160            }
161        }
162         
163        // extract acres
164         
165            for ($i = 0; $i < count($taxpayer); $i++) {
166            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
167                if (preg_match('/ACRES *?(\d+)/', $taxpayer[$i][$j],$acres)) {
168                    $formPayer[16] = $acres[1];
169                }
170                else if (preg_match('/ACRES/', $taxpayer[$i][$j])) {
171                    if (preg_match('/^([0-9.]+)/', $taxpayer[$i][$j+1], $acres)) $formPayer[16] = $acres[1];
172                }
173            }
174        }
175 
176    // extract full market value
177 
178            for ($i = 0; $i < count($taxpayer); $i++) {
179            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
180                if (preg_match('/FULL MARKET VALUE *?(\d+)/', $taxpayer[$i][$j],$value)) {
181                    $formPayer[17] = str_replace(',','',$value[1]);
182                }
183                else if (preg_match('/FULL MARKET VALUE/', $taxpayer[$i][$j])) {
184                    if (preg_match('/^([0-9,]+)/', $taxpayer[$i][$j+1], $value)) $formPayer[17] = str_replace(',','',$value[1]);
185                }
186            }
187        }
188         
189        // extract deed book info
190            for ($i = 0; $i < count($taxpayer); $i++) {
191                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
192                     
193                                         
194                    if (preg_match('/DEED BOOK *?(\d+) *?PG-(\d+)/', $taxpayer[$i][$j],$value)) {
195                        $formPayer[18] = $value[1];
196                        $formPayer[19] = $value[2];
197                    }
198                    else if (preg_match('/DEED BOOK *?(\d+)/', $taxpayer[$i][$j],$value)) {
199                        $formPayer[18] = $value[1];
200                        if (isset($taxpayer[$i][$j+1]) && preg_match('/^PG-(\d+)/', $taxpayer[$i][$j+1], $value)) $formPayer[19] = $value[1];
201                    }
202                }
203            }
204                 
205            // county taxable amount
206            for ($i = 0; $i < count($taxpayer); $i++) {
207                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
208                    if (preg_match('/COUNTY TAXABLE VALUE/', $taxpayer[$i][$j])) $formPayer[20] = chop(str_replace(',','',$taxpayer[$i][$j+1]));
209                }
210            }
211 
212        // school taxable amount
213            for ($i = 0; $i < count($taxpayer); $i++) {
214                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
215                    if (preg_match('/SCHOOL TAXABLE VALUE/', $taxpayer[$i][$j])) $formPayer[21] = chop(str_replace(',','',$taxpayer[$i][$j+1]));
216                }
217            }  
218        // city taxable amount
219            for ($i = 0; $i < count($taxpayer); $i++) {
220                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
221                    if (isset($taxpayer[$i][$j]) && preg_match('/^(CITY|TOWN)/', $taxpayer[$i][$j])) {
222                        if (isset($taxpayer[$i][$j+1]) && preg_match('/^TAXABLE VALUE/', $taxpayer[$i][$j+1])) $formPayer[22] =  chop(str_replace(',','',$taxpayer[$i][$j+2]));
223                         
224                    }
225                }  
226            }
227     
228         
229        // field relating to solar power (for munis that have such laws)
230        $formPayer[24] = '';
231        for ($i = 0; $i < count($taxpayer); $i++) {
232            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
233                if (preg_match('/solar/i', $taxpayer[$i][$j])) {
234                    $formPayer[24] .= "{$taxpayer[$i][$j]},";
235                }
236            }  
237        }  
238         
239        // STAR
240        $formPayer[25] = '';
241        for ($i = 0; $i < count($taxpayer); $i++) {
242            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
243                if (preg_match('/ STAR/', $taxpayer[$i][$j])) {
244                    $formPayer[25] .= "{$taxpayer[$i][$j]},";
245                }
246            }  
247        }
248         
249        // STAR
250        $formPayer[26] = '';
251        for ($i = 0; $i < count($taxpayer); $i++) {
252            for ($j = 0; $j < count($taxpayer[$i]); $j++) {
253                if (preg_match('/(VET WAR|CW_15_VET|VETWAR|VETDIS|VETERANS)/', $taxpayer[$i][$j])) {
254                    $formPayer[26] .= "{$taxpayer[$i][$j]},";
255                }
256            }  
257        }  
258         
259        // SCHOOL
260        $formPayer[27] = $taxpayer[2][1];  
261         
262        // columns 28+ are special districts
263        $l = 28;
264         
265        foreach ($specialDistType as $type) {  
266            $formPayer[$l] = '';
267                 
268            for ($i = 0; $i < count($taxpayer); $i++) {
269                for ($j = 0; $j < count($taxpayer[$i]); $j++) {
270                    if (isset($taxpayer[$i][$j]) && preg_match('/^(\w\w)(\d\d\d) (.*?)( TO|$|\d{2,})/', $taxpayer[$i][$j],$matches)) {
271                        if ($matches[1] == $type) $formPayer[$l] .= "{$matches[1]}{$matches[2]} {$matches[3]} ";
272                    }
273                }  
274            }
275             
276            $l++;
277        }
278         
279         
280        // sort and add missing keys
281        for ($i = 0; $i < count($formPayer); $i++) {
282            if (!isset($formPayer[$i])) $formPayer[$i] = '';
283        }
284         
285         
286        ksort($formPayer);
287         
288                // shift onto the rolls county, town, village, swis
289        for ($i = 0; $i < count($taxpayer); $i++) {
290                 
291                if ($taxpayer[$i][0] != 'location') continue;
292                 
293                // add array to line               
294                for ($j = count($taxpayer[$i])-1; $j > 0; $j--) array_unshift($formPayer, $taxpayer[$i][$j]);
295                 
296        }
297         
298         
299        $formTax[] = $formPayer;
300         
301        }
302 
303 
304        // lastly sort form by street and number
305         
306        $addNum = array();
307        $addSt = array();
308        $own1 = array();
309        for ($i = 0; $i < count($formTax); $i++) {
310          $addSt[] = $formTax[$i][6];
311          $addNum[] = $formTax[$i][5];
312          $own1[] =  $formTax[$i][7];
313        }
314 
315        // now apply sort
316        array_multisort($addSt, SORT_ASC,
317                $addNum, SORT_NUMERIC, SORT_ASC,
318                $own1, SORT_ASC,
319                $formTax);
320                 
321                 
322    //print_r($formTax);
323 
324    echo "\nWriting to CSV ...";
325 
326    // print out form
327    $output .=  '"Tax Roll","County","Town","Village","SWIS","Tax ID","Street Number","Street Name","Owner 1","Owner 2","Owner 3","Owner 4",'
328                .'"Mail Address 1","Mail Address 2","Mail Address 3","Mail City","Mail State","Mail Zip",'
329                .'"Property Type","East","North","Acres","Full Market Value","Deed Book","Deed Pg",'
330                .'"County Value","School Value","Town Value","Full Street",'
331                .'"Solar","STAR","VETS","School",';
332                 
333    foreach ($specialDistType as $type) {
334        $output .= "\"$type\",";
335    }
336                 
337    $output .=  "\n";
338 
339    foreach ($formTax as $line) {
340        $output .=  '"'.$town.'",';
341        foreach ($line as $item) {
342            $output .=  '"'.$item.'",';
343        }
344         
345        $output .=  "\n";
346    }
347     
348    // save output to file
349    file_put_contents("output-csv/$town.csv", $output);
350     
351    echo " DONE\n";
352}
353 
354// last, create a great big file
355//system("cat output-csv/*.csv > all-property.csv");
356 
357system("zip output-csv.zip output-csv/*");

2 Comments

  • Brad Stratton says:

    This is great! I wish I had found this 3 years ago. I was wondering if it’s possible to add in the additional Ag District info – ie. β€œMAY BE SUBJECT TO PAYMENT UNDER AGDIST LAW TIL 2024.” It didn’t transfer over in the test I did.

  • Marcus Catlin says:

    Hi Andy,

    I stumbled upon your site to convert the assessment rolls and find it fascinating – all towns should be using this to make their data more accessable.

    I have been able to get the R code to work but for whatever reason it is cutting off the first digit of the sbl – Not sure if it is something on my end but I wanted to let you know

1 Trackback or Pingback

Leave a Reply

Your email address will not be published. Required fields are marked *