RTF parsing + screenshotting capability added for further information

nekromoff · Jan 30, 2016 · fb1833d · fb1833d
1 parent 69fde93
commit fb1833d
Show file tree

Hide file tree

Showing 3 changed files with 297 additions and 0 deletions.
diff --git a/config.php.example b/config.php.example
@@ -20,6 +20,8 @@ $config["template"]='template.docx';
 $config["tempdir"]='tmp/';
 // CSV file with addresses to use for template, format: name|usedname|street|city|zip|email
 $config["addresses"]='example.csv';
+// CSV file with land to GPS mapping, format: parcela;long;lat
+// (stored under its own key so it does not overwrite $config["addresses"] above)
+$config["landgps"]='example-gps.csv';
 
 /** google drive details for file upload */
 // google drive client id - get it here (credentials): https://console.developers.google.com
@@ -33,4 +35,8 @@ $config["gdrive-folderid"]='';
 // google drive users to give "writer" permissions to
 $config["gdrive-users"]=array();
 
+/** website screenshot */
+// API key for screenshotmachine.com; leave blank if no screenshots are required. Get a key here: https://www.screenshotmachine.com/register.php?plan=free
+$config["screenshot-apikey"]='';
+
 ?>
diff --git a/rtf2text.php b/rtf2text.php
@@ -0,0 +1,148 @@
+<?php
+
+/**
+ * Decides whether text read in the given RTF group state belongs to the output.
+ *
+ * A group is rejected as soon as any of the marker keys "*", "fonttbl",
+ * "colortbl", "datastore" or "themedata" holds a non-empty value in $s.
+ *
+ * @param array|null $s Control-word state of the current group (keyword => value).
+ * @return bool true when none of the marker keys is set, false otherwise.
+ */
+function rtf_isPlainText($s) {
+    foreach (array("*", "fonttbl", "colortbl", "datastore", "themedata") as $marker) {
+        if (!empty($s[$marker])) {
+            return false;
+        }
+    }
+    return true;
+}
+
+/**
+ * Converts an RTF document to plain text.
+ *
+ * Walks the input byte by byte and keeps one state array per open RTF group
+ * (the control words seen in it; a new group starts as a copy of its parent).
+ * Text is emitted only while rtf_isPlainText() accepts the current group state.
+ *
+ * Known limitations kept from the original implementation:
+ *  - \uN (Unicode) characters are not emitted; only the \ucN fallback bytes
+ *    that follow them are skipped.
+ *  - Escaped braces (\{ and \}) are dropped.
+ *
+ * @param string $text Raw RTF source.
+ * @return string Extracted text; "" for empty input.
+ */
+function rtf2text($text) {
+    $len = strlen($text);
+    if (!$len)
+        return "";
+
+    $document = "";
+    // Stack of group states. Slot 0 is a base state for anything outside braces,
+    // so $stack[$j] is always defined, even for malformed input.
+    $stack = array(array());
+    $j = 0;
+    // Read the data character by character.
+    for ($i = 0; $i < $len; $i++) {
+        $c = $text[$i];
+
+        // Depending on the current character select the further actions.
+        switch ($c) {
+            // Backslash introduces a control symbol or a control word.
+            case "\\":
+                // Peek at the next character; '' when the backslash is the last byte.
+                $nc = ($i + 1 < $len) ? $text[$i + 1] : '';
+
+                // Escaped backslash, non-breaking space and non-breaking hyphen are plain text.
+                if ($nc == '\\' && rtf_isPlainText($stack[$j])) $document .= '\\';
+                elseif ($nc == '~' && rtf_isPlainText($stack[$j])) $document .= ' ';
+                elseif ($nc == '_' && rtf_isPlainText($stack[$j])) $document .= '-';
+                // An asterisk flags the current group; rtf_isPlainText() then rejects its text.
+                elseif ($nc == '*') $stack[$j]["*"] = true;
+                // A single quote is followed by two hex digits naming one character.
+                elseif ($nc == "'") {
+                    $hex = substr($text, $i + 2, 2);
+                    // Emit only for two real hex digits (guards against truncated input).
+                    if (preg_match('/^[0-9a-fA-F]{2}$/', (string) $hex) && rtf_isPlainText($stack[$j]))
+                        $document .= html_entity_decode("&#".hexdec($hex).";");
+                    // Shift the pointer over the two hex digits.
+                    $i += 2;
+                // A letter starts a control word, optionally followed by a numeric parameter.
+                } elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
+                    $word = "";
+                    // null means "no parameter seen yet". It is compared with === because a
+                    // literal "0" parameter (\b0, \uc0) must count as present; empty() misses it.
+                    $param = null;
+
+                    // Read the control word and its parameter; $k ends on the first
+                    // character that is not part of them.
+                    for ($k = $i + 1; $k < $len; $k++) {
+                        $nc = $text[$k];
+                        if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
+                            // Letters after the parameter belong to the following text.
+                            if ($param === null)
+                                $word .= $nc;
+                            else
+                                break;
+                        } elseif ($nc >= '0' && $nc <= '9')
+                            $param .= $nc;
+                        // A minus sign may only open the parameter.
+                        elseif ($nc == '-') {
+                            if ($param === null)
+                                $param .= $nc;
+                            else
+                                break;
+                        } else
+                            break;
+                    }
+                    // A single space after a control word only delimits it and is not text.
+                    if ($k < $len && $text[$k] === ' ')
+                        $k++;
+                    // Position $i so that the shared $i++ below plus the loop increment land on $k.
+                    $i = $k - 2;
+
+                    // Start analyzing what we have read.
+                    $toText = "";
+                    switch (strtolower($word)) {
+                        // \uN: the character itself is not emitted (see the function header);
+                        // skip the fallback bytes announced by the group's \ucN value.
+                        case "u":
+                            $ucDelta = isset($stack[$j]["uc"]) ? (int) $stack[$j]["uc"] : 0;
+                            if ($ucDelta > 0)
+                                $i += $ucDelta;
+                        break;
+                        // Line feeds, spaces and tabs.
+                        case "par": case "page": case "column": case "line": case "lbr":
+                            $toText .= "\n";
+                        break;
+                        case "emspace": case "enspace": case "qmspace":
+                            $toText .= " ";
+                        break;
+                        case "tab": $toText .= "\t"; break;
+                        // Current date and time instead of the corresponding labels.
+                        case "chdate": $toText .= date("m.d.Y"); break;
+                        case "chdpl": $toText .= date("l, j F Y"); break;
+                        case "chdpa": $toText .= date("D, j M Y"); break;
+                        case "chtime": $toText .= date("H:i:s"); break;
+                        // Symbol control words, produced from their HTML entities.
+                        case "emdash": $toText .= html_entity_decode("&mdash;"); break;
+                        case "endash": $toText .= html_entity_decode("&ndash;"); break;
+                        case "bullet": $toText .= html_entity_decode("&#149;"); break;
+                        case "lquote": $toText .= html_entity_decode("&lsquo;"); break;
+                        case "rquote": $toText .= html_entity_decode("&rsquo;"); break;
+                        case "ldblquote": $toText .= html_entity_decode("&laquo;"); break;
+                        case "rdblquote": $toText .= html_entity_decode("&raquo;"); break;
+                        // Remember every other control word in the group state; a word
+                        // without a parameter is stored as true.
+                        default:
+                            $stack[$j][strtolower($word)] = ($param === null) ? true : $param;
+                        break;
+                    }
+                    // Add data to the output stream if required.
+                    if (rtf_isPlainText($stack[$j]))
+                        $document .= $toText;
+                }
+
+                // Step over the character that followed the backslash.
+                $i++;
+            break;
+            // Opening brace: a new group starts with a copy of its parent's state.
+            case "{":
+                $stack[] = $stack[$j];
+                $j++;
+            break;
+            // Closing brace: drop the group state. The base slot is never popped, so
+            // stray closing braces cannot desynchronise $j and the stack.
+            case "}":
+                if ($j > 0) {
+                    array_pop($stack);
+                    $j--;
+                }
+            break;
+            // Skip NUL, CR, FF and LF. Double quotes are required here: in single
+            // quotes these would be two-character strings that never match one byte.
+            case "\0": case "\r": case "\f": case "\n": break;
+            // Add other data to the output stream if required.
+            default:
+                if (rtf_isPlainText($stack[$j]))
+                    $document .= $c;
+            break;
+        }
+    }
+    // Return result.
+    return $document;
+}
+
+?>
diff --git a/scrape.php b/scrape.php
@@ -5,9 +5,12 @@
 </head>
 <body>
 <?php
+ini_set('max_execution_time', 300);
 require('config.php');
 require('simple_html_dom.php');
+require('rtf2text.php');
 require('PHPMailer/PHPMailerAutoload.php');
+$ch=curl_init();
 $hashes=file('lasthash.txt');
 foreach ($hashes as $key=>$hash)
    {
@@ -54,6 +57,7 @@
 $i=0;
 foreach($html->find('tr') as $line)
    {
+   $rtfcontent="";
    $project=$line->find('td',0)->plaintext;
    $link=$line->getElementByTagName('a')->href;
    $city=$line->find('td',2)->plaintext;
@@ -72,6 +76,129 @@
       $projects[$i]["institution"]=trim($htmlchild->find('.table-list li',2)->getElementByTagName('span')->plaintext);
       $projects[$i]["proponent"]=trim($htmlchild->find('.table-list li',3)->getElementByTagName('span')->plaintext);
       $projects[$i]["proponentidnumber"]=trim($htmlchild->find('.table-list li',4)->getElementByTagName('span')->plaintext);
+      // if screenshot API key exists, make lots of additional effort:
+      // fetch the "Oznámenie o predložení zámeru" attachment (RTF, or a ZIP holding one),
+      // find the place of realisation in it and resolve it to coordinates that are
+      // stored as $projects[$i]["long"] / $projects[$i]["lat"]
+      if ($config["screenshot-apikey"])
+         {
+         // coordinates belong to this project only; reset them so that values found for a previous project cannot leak in
+         $long=null;
+         $lat=null;
+         $content=$htmlchild->find('text');
+         foreach ($content as $text)
+            {
+            if (strpos($text,'Oznámenie o predložení zámeru:')!==FALSE)
+               {
+               $infofile=$text->parent->parent->find('li',0)->find('a',0)->href;
+               $infourl='https://www.enviroportal.sk'.$infofile;
+
+               // header-only request: only the content type of the attachment is needed here
+               curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
+               curl_setopt($ch, CURLOPT_URL, $infourl);
+               curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
+               curl_setopt($ch, CURLOPT_HEADER, 1);
+               curl_setopt($ch, CURLOPT_NOBODY, 1);
+               curl_exec($ch);
+               $contenttype=curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
+               if ($contenttype=='application/rtf')
+                  {
+                  $rtfcontent=file_get_contents($infourl);
+                  break;
+                  }
+               elseif ($contenttype=='application/zip')
+                  {
+                  $zipfile=$config["tempdir"].'tmpfile.zip';
+                  if (copy($infourl,$zipfile))
+                     {
+                     $zip=new ZipArchive;
+                     if ($zip->open($zipfile)===TRUE)
+                        {
+                        // read the first archive entry straight from the zip, no extraction to disk needed
+                        $rtfcontent=$zip->getFromIndex(0);
+                        $zip->close();
+                        }
+                     // do not leave the downloaded archive behind
+                     unlink($zipfile);
+                     }
+                  break;
+                  }
+               }
+            }
+         // basic info RTF exists, extract exact location and pull a screenshot
+         if ($rtfcontent)
+            {
+            $content=rtf2text($rtfcontent);
+            $content=str_replace('?','',$content);
+            $content=preg_replace_callback("/(&#[0-9]+;)/", function($m) { return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); }, $content);
+            // NOTE(review): this collapses every run of a repeated character, digits and double letters included - confirm that is intended
+            $content=preg_replace('{(.)\1+}u','$1',$content);
+            // split into trimmed, non-empty lines
+            $parsedtext=array();
+            foreach (preg_split("/\r?\n/",$content) as $textline)
+               {
+               $textline=trim($textline);
+               if ($textline) $parsedtext[]=$textline;
+               }
+            // index of the "Miesto realizácie" heading line; stays null when the document has none
+            // (dedicated loop variables are used below so that neither the project counter $i
+            // nor the table row in $line gets overwritten)
+            $startline=null;
+            foreach ($parsedtext as $n=>$textline)
+               {
+               if (stripos($textline,'Miesto real')!==FALSE OR stripos($textline,'Mesto real')!==FALSE)
+                  {
+                  $startline=$n;
+                  break;
+                  }
+               }
+            $linecount=count($parsedtext);
+            // land registry "parcela" number, empty when none was found
+            $location="";
+            if ($startline!==null)
+               {
+               // loop over the lines following "Miesto realizácie" until the end to find land registry "parcela" number
+               for ($n=$startline+1;$n<$linecount;$n++)
+                  {
+                  if (preg_match('/[0-9]{3,}\/[0-9]{1,}/',$parsedtext[$n],$matches)===1)
+                     {
+                     $location=trim($matches[0]);
+                     break;
+                     }
+                  }
+               // no "parcela" number: geocode the lines following "Miesto realizácie" until one resolves to a place in Slovakia
+               if (!$location)
+                  {
+                  // separate handle, so the shared $ch is neither replaced nor leaked
+                  $geoch=curl_init();
+                  curl_setopt($geoch, CURLOPT_RETURNTRANSFER,1);
+                  curl_setopt($geoch, CURLOPT_HEADER, 0);
+                  curl_setopt($geoch, CURLOPT_NOBODY, 0);
+                  for ($n=$startline+1;$n<$linecount;$n++)
+                     {
+                     //geocoding
+                     curl_setopt($geoch, CURLOPT_URL, 'https://nominatim.openstreetmap.org/search?q='.urlencode($parsedtext[$n]).'&format=json');
+                     $json=json_decode(curl_exec($geoch));
+                     // enforce slovakia, otherwise skip; is_array() guards against a non-list JSON response
+                     if (is_array($json) AND isset($json[0]->lon) AND isset($json[0]->lat) AND (stripos($json[0]->display_name,'Slovakia')!==FALSE OR stripos($json[0]->display_name,'Slovensko')!==FALSE))
+                        {
+                        $long=$json[0]->lon;
+                        $lat=$json[0]->lat;
+                        break;
+                        }
+                     }
+                  curl_close($geoch);
+                  }
+               }
+            // find coords based on land registry "parcela" number in the parcela;long;lat CSV
+            if ($location AND !empty($config["landgps"]) AND is_readable($config["landgps"]))
+               {
+               // FILE_IGNORE_NEW_LINES keeps the line break out of the last column
+               foreach (file($config["landgps"],FILE_IGNORE_NEW_LINES) as $csvline)
+                  {
+                  if (strpos($csvline,$location)!==FALSE)
+                     {
+                     $parts=explode(';',$csvline);
+                     if (count($parts)>=3)
+                        {
+                        $long=trim($parts[1]);
+                        $lat=trim($parts[2]);
+                        break;
+                        }
+                     }
+                  }
+               }
+            $projects[$i]["long"]=$long;
+            $projects[$i]["lat"]=$lat;
+            }
+         }
       $i++;
       }
    }
@@ -97,6 +224,17 @@
       $mail->Body.=$project["institution"]."\n";
       $mail->Body.="http://www.enviroportal.sk".$project["link"]."\n";
       $mail->Subject.=" ".$project["name"];
+      // include screenshot in the email
+      if ($config["screenshot-apikey"] AND isset($project["long"]) AND isset($project["lat"]))
+         {
+         // build the map URL from this project's own coordinates (not from the $long/$lat globals,
+         // which hold whatever the last scraped project left behind)
+         $mapurl='http%3A%2F%2Flabs.strava.com%2Fheatmap%2F%2315%2F'.rawurlencode(trim($project["long"])).'%2F'.rawurlencode(trim($project["lat"])).'%2Fblue%2Fbike';
+         $mapimage=imagecreatefrompng('http://api.screenshotmachine.com/?key='.$config["screenshot-apikey"].'&size=F&format=PNG&cacheLimit=0&timeout=1000&url='.$mapurl);
+         // the screenshot service may fail or return something that is not a PNG; then send the email without a map
+         if ($mapimage!==FALSE)
+            {
+            $mapfile=$config['tempdir'].$hash.'.png';
+            // crop the screenshot to a 579x708 canvas, starting at x=445, y=60 of the source
+            $mapimagenew=imagecreatetruecolor(579,708);
+            imagecopy($mapimagenew,$mapimage,0,0,445,60,1024,708);
+            imagepng($mapimagenew,$mapfile);
+            $mail->AddAttachment($mapfile);
+            imagedestroy($mapimagenew); imagedestroy($mapimage);
+            }
+         }
+      // if templating enabled, generate document
       if ($config["template"] AND !$existingprojectstate)
          {
          unset($institutionkey);
@@ -167,6 +305,11 @@
          {
          echo ', email '.$mail->Subject.' has been sent';
          }
+      // remove the temporary map image; it is written to the temp dir as <hash>.png when the
+      // screenshot is attached, so that is the file to delete (and only if it was really created)
+      if ($config["screenshot-apikey"] AND isset($project["long"]) AND isset($project["lat"]))
+         {
+         $mapfile=$config["tempdir"].$hash.'.png';
+         if (is_file($mapfile)) unlink($mapfile);
+         }
+      $mail->clearAttachments();
       echo '<br />';
       }
    }