Skip to content

Commit

Permalink
RTF parsing + screenshotting capability added for further information
Browse files Browse the repository at this point in the history
  • Loading branch information
nekromoff committed Jan 30, 2016
1 parent 69fde93 commit fb1833d
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 0 deletions.
6 changes: 6 additions & 0 deletions config.php.example
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ $config["template"]='template.docx';
$config["tempdir"]='tmp/';
// CSV file with addresses to use for template, format: name|usedname|street|city|zip|email
$config["addresses"]='example.csv';
// CSV file with land parcel ("parcela") to GPS mapping, format: parcela;long;lat
// scrape.php reads this file through the "landgps" key; it must not reuse the
// "addresses" key, otherwise the address CSV configured above is overwritten
$config["landgps"]='example-gps.csv';

/** google drive details for file upload */
// google drive client id - get it here (credentials): https://console.developers.google.com
Expand All @@ -33,4 +35,8 @@ $config["gdrive-folderid"]='';
// list of google drive users that should be given "writer" permissions (empty array = nobody)
$config["gdrive-users"]=array();

/** website screenshot */
// API key for screenshotmachine.com; leave blank if no screenshots are required.
// A key can be obtained here: https://www.screenshotmachine.com/register.php?plan=free
// scrape.php only runs its screenshot-related steps when this value is non-empty.
$config["screenshot-apikey"]='';

?>
148 changes: 148 additions & 0 deletions rtf2text.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
<?php

/**
 * Tell whether characters read in the given RTF group belong to the visible text.
 *
 * A group is treated as non-text as soon as one of the listed control words
 * ("*", "fonttbl", "colortbl", "datastore", "themedata") is set in it.
 *
 * @param array|null $s Control words collected for the current group (word => parameter or true).
 * @return bool True when text of this group should be copied to the output.
 */
function rtf_isPlainText($s) {
    $arrfailAt = array("*", "fonttbl", "colortbl", "datastore", "themedata");
    foreach ($arrfailAt as $word) {
        if (!empty($s[$word])) return false;
    }
    return true;
}

/**
 * Extract the plain text from an RTF document.
 *
 * The input is scanned byte by byte. Control words are collected per group
 * (groups are delimited by braces and inherit the state of the enclosing group);
 * text is copied to the output only while rtf_isPlainText() accepts the current
 * group state.
 *
 * Known limitations that are kept on purpose so existing callers see the same text:
 *  - \uN itself produces no output; only the \ucN replacement characters after it are skipped.
 *  - \'hh is decoded through html_entity_decode() with its default flags and charset,
 *    so values PHP refuses to decode stay in the output as "&#NNN;" entities.
 *  - The character that terminates a control word is processed as ordinary input,
 *    so a delimiting space still shows up in the output.
 *
 * @param string $text Raw RTF source.
 * @return string Extracted text; "" for empty input.
 */
function rtf2text($text) {
    $len = strlen($text);
    if (!$len)
        return "";

    $document = "";
    // Saved states of the enclosing groups; $state is the group currently being read.
    // Keeping the current state in its own variable avoids indexing the stack with -1
    // for text outside of any group and keeps the stack sane on unbalanced braces.
    $stack = array();
    $state = array();

    for ($i = 0; $i < $len; $i++) {
        $c = $text[$i];

        switch ($c) {
            // Backslash introduces a control symbol or a control word.
            case "\\":
                // A backslash at the very end has nothing to introduce.
                if ($i + 1 >= $len)
                    break;
                $nc = $text[$i + 1];

                // Escaped backslash, non-breaking space and non-breaking hyphen are text.
                if ($nc == '\\' && rtf_isPlainText($state)) $document .= '\\';
                elseif ($nc == '~' && rtf_isPlainText($state)) $document .= ' ';
                elseif ($nc == '_' && rtf_isPlainText($state)) $document .= '-';
                // Escaped braces are literal characters, not group delimiters.
                elseif ($nc == '{' || $nc == '}') {
                    if (rtf_isPlainText($state))
                        $document .= $nc;
                }
                // \* marks the group so that its content is not treated as text.
                elseif ($nc == '*') $state["*"] = true;
                // \'hh: two hexadecimal digits giving a character code.
                elseif ($nc == "'") {
                    $hex = substr($text, $i + 2, 2);
                    if (rtf_isPlainText($state))
                        $document .= html_entity_decode("&#".hexdec($hex).";");
                    // Skip the two hex digits.
                    $i += 2;
                // A letter starts a control word, optionally followed by a numeric parameter.
                } elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                    $word = "";
                    $param = null;

                    for ($k = $i + 1, $m = 0; $k < $len; $k++, $m++) {
                        $nc = $text[$k];
                        if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') {
                            // Letters belong to the word only until the parameter starts.
                            // Compare against null: empty() would treat the parameter "0" as absent.
                            if ($param === null)
                                $word .= $nc;
                            else
                                break;
                        } elseif ($nc >= '0' && $nc <= '9')
                            $param .= $nc;
                        // A minus sign is accepted only as the first character of the parameter.
                        elseif ($nc == '-') {
                            if ($param === null)
                                $param .= $nc;
                            else
                                break;
                        } else
                            break;
                    }
                    // Move to the last character that was read (the shared $i++ below steps past it).
                    $i += $m - 1;

                    $toText = "";
                    switch (strtolower($word)) {
                        // \uN: the Unicode character is not emitted here; the \ucN
                        // replacement characters that follow it are skipped.
                        case "u":
                            $ucDelta = isset($state["uc"]) ? (int)$state["uc"] : 0;
                            if ($ucDelta > 0)
                                $i += $ucDelta;
                            break;
                        // Line feeds, spaces and tabs.
                        case "par": case "page": case "column": case "line": case "lbr":
                            $toText .= "\n";
                            break;
                        case "emspace": case "enspace": case "qmspace":
                            $toText .= " ";
                            break;
                        case "tab": $toText .= "\t"; break;
                        // Current date and time in place of the corresponding fields.
                        case "chdate": $toText .= date("m.d.Y"); break;
                        case "chdpl": $toText .= date("l, j F Y"); break;
                        case "chdpa": $toText .= date("D, j M Y"); break;
                        case "chtime": $toText .= date("H:i:s"); break;
                        // Special characters, produced from their HTML entities.
                        case "emdash": $toText .= html_entity_decode("&mdash;"); break;
                        case "endash": $toText .= html_entity_decode("&ndash;"); break;
                        case "bullet": $toText .= html_entity_decode("&#149;"); break;
                        case "lquote": $toText .= html_entity_decode("&lsquo;"); break;
                        case "rquote": $toText .= html_entity_decode("&rsquo;"); break;
                        case "ldblquote": $toText .= html_entity_decode("&laquo;"); break;
                        case "rdblquote": $toText .= html_entity_decode("&raquo;"); break;
                        // Every other control word is remembered for the current group:
                        // its parameter when it has one (including "0"), otherwise true.
                        default:
                            $state[strtolower($word)] = ($param === null) ? true : $param;
                            break;
                    }
                    if (rtf_isPlainText($state))
                        $document .= $toText;
                }

                // Step over the character that followed the backslash.
                $i++;
                break;
            // Opening brace: remember the current state; the new group starts with a copy of it.
            case "{":
                $stack[] = $state;
                break;
            // Closing brace: return to the state of the enclosing group.
            case "}":
                $state = empty($stack) ? array() : array_pop($stack);
                break;
            // Raw NUL, CR, FF and LF bytes carry no text. The literals must be double-quoted:
            // in single quotes '\r' is the two characters backslash + r and never matches.
            case "\0": case "\r": case "\f": case "\n": break;
            // Anything else is text when the current group allows it.
            default:
                if (rtf_isPlainText($state))
                    $document .= $c;
                break;
        }
    }
    return $document;
}

?>
143 changes: 143 additions & 0 deletions scrape.php
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,12 @@
</head>
<body>
<?php
// allow the script to run for up to 300 seconds
ini_set('max_execution_time', 300);
// configuration array $config
require('config.php');
// HTML parser
require('simple_html_dom.php');
// rtf2text(): plain-text extraction from RTF documents
require('rtf2text.php');
// mailer
require('PHPMailer/PHPMailerAutoload.php');
// cURL handle used further below to probe the content type of linked documents
$ch=curl_init();
// lasthash.txt is read into an array with one element per line
$hashes=file('lasthash.txt');
foreach ($hashes as $key=>$hash)
{
Expand Down Expand Up @@ -54,6 +57,7 @@
$i=0;
foreach($html->find('tr') as $line)
{
$rtfcontent="";
$project=$line->find('td',0)->plaintext;
$link=$line->getElementByTagName('a')->href;
$city=$line->find('td',2)->plaintext;
Expand All @@ -72,6 +76,129 @@
$projects[$i]["institution"]=trim($htmlchild->find('.table-list li',2)->getElementByTagName('span')->plaintext);
$projects[$i]["proponent"]=trim($htmlchild->find('.table-list li',3)->getElementByTagName('span')->plaintext);
$projects[$i]["proponentidnumber"]=trim($htmlchild->find('.table-list li',4)->getElementByTagName('span')->plaintext);
// if screenshot API key exists, make lots of additional effort:
// fetch the project's notice document (RTF, possibly zipped), find the place of
// realisation in its text and resolve it to GPS coordinates stored on the project
if ($config["screenshot-apikey"])
{
    // per-project state; without the reset, coordinates of the previously
    // processed project would be reused for this one
    $rtfcontent="";
    $location="";
    $long=null;
    $lat=null;

    $textnodes=$htmlchild->find('text');
    foreach ($textnodes as $text)
    {
        if (strpos($text,'Oznámenie o predložení zámeru:')!==FALSE)
        {
            $infofile=$text->parent->parent->find('li',0)->find('a',0)->href;
            $infourl='https://www.enviroportal.sk'.$infofile;

            // header-only request: only the content type of the linked file is needed
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_URL, $infourl);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 1);
            curl_setopt($ch, CURLOPT_NOBODY, 1);
            curl_exec($ch);
            $contenttype=curl_getinfo($ch, CURLINFO_CONTENT_TYPE);
            if ($contenttype=='application/rtf')
            {
                $rtfcontent=file_get_contents($infourl);
                break;
            }
            elseif ($contenttype=='application/zip')
            {
                if (copy($infourl,'tmpfile.zip'))
                {
                    $zip=new ZipArchive;
                    if ($zip->open('tmpfile.zip')===TRUE)
                    {
                        // read the first archive entry straight into memory; nothing is written
                        // to disk under a file name taken from the downloaded archive
                        // NOTE(review): assumes the first entry is the RTF document, as the original code did
                        $rtfcontent=$zip->getFromIndex(0);
                        $zip->close();
                    }
                    unlink('tmpfile.zip');
                }
                break;
            }
        }
    }
    // basic info RTF exists, extract exact location and pull a screenshot
    if ($rtfcontent)
    {
        $content=rtf2text($rtfcontent);
        $content=str_replace('?','',$content);
        $content=preg_replace_callback("/(&#[0-9]+;)/", function($m) { return mb_convert_encoding($m[1], "UTF-8", "HTML-ENTITIES"); }, $content);
        // NOTE(review): this collapses every run of a repeated character, digits included
        // (e.g. "1100/22" becomes "10/2"); confirm this is intended before relying on parcel numbers
        // preg_replace() returns null when the text is not valid UTF-8; keep the text in that case
        $collapsed=preg_replace('{(.)\1+}u','$1',$content);
        if ($collapsed!==null)
        {
            $content=$collapsed;
        }
        // trimmed, non-empty lines of the document, indexed from 0
        $parsedtext=array();
        foreach (preg_split("/\r?\n/",$content) as $textline)
        {
            $textline=trim($textline);
            if ($textline)
            {
                $parsedtext[]=$textline;
            }
        }
        $linecount=count($parsedtext);
        // index of the first line after the "Miesto realizácie" heading;
        // stays at $linecount (nothing to scan) when the heading is not found
        $startindex=$linecount;
        foreach ($parsedtext as $lineindex=>$textline)
        {
            if (stripos($textline,'Miesto real')!==FALSE OR stripos($textline,'Mesto real')!==FALSE)
            {
                $startindex=$lineindex+1;
                break;
            }
        }
        // loop over the lines following "Miesto realizácie" until the end to find land registry "parcela" number
        // (a dedicated loop variable is used: $i is the index of the current project and must not change)
        for ($n=$startindex;$n<$linecount;$n++)
        {
            if (preg_match('/[0-9]{3,}\/[0-9]{1,}/',$parsedtext[$n],$matches)===1)
            {
                $location=trim($matches[0]);
                break;
            }
        }
        // loop over the lines following "Miesto realizácie" until the end to identify existing location
        if (!$location)
        {
            // separate handle for geocoding so the shared $ch is neither replaced nor leaked
            // NOTE(review): Nominatim's usage policy asks for an identifying User-Agent - consider setting CURLOPT_USERAGENT
            $geoch=curl_init();
            curl_setopt($geoch, CURLOPT_RETURNTRANSFER,1);
            curl_setopt($geoch, CURLOPT_HEADER, 0);
            curl_setopt($geoch, CURLOPT_NOBODY, 0);
            for ($n=$startindex;$n<$linecount;$n++)
            {
                $location=$parsedtext[$n];
                //geocoding
                curl_setopt($geoch, CURLOPT_URL, 'https://nominatim.openstreetmap.org/search?q='.urlencode($location).'&format=json');
                $geocode=curl_exec($geoch);
                // a failed request or a non-list answer must not be indexed like a result list
                $json=is_string($geocode) ? json_decode($geocode) : null;
                // enforce slovakia, otherwise skip
                if (is_array($json) AND isset($json[0]->lon) AND isset($json[0]->lat) AND isset($json[0]->display_name) AND (stripos($json[0]->display_name,'Slovakia')!==FALSE OR stripos($json[0]->display_name,'Slovensko')!==FALSE))
                {
                    $long=$json[0]->lon;
                    $lat=$json[0]->lat;
                    break;
                }
            }
            curl_close($geoch);
        }
        // find coords based on land registry "parcela" number, if not found by geocoding
        if ($location AND !$long AND !$lat AND !empty($config["landgps"]) AND is_readable($config["landgps"]))
        {
            // line endings are stripped so the last column does not carry a newline
            foreach (file($config["landgps"], FILE_IGNORE_NEW_LINES) as $csvline)
            {
                if (strpos($csvline,$location)!==FALSE)
                {
                    $parts=explode(';',$csvline);
                    if (count($parts)>=3)
                    {
                        $long=trim($parts[1]);
                        $lat=trim($parts[2]);
                        break;
                    }
                }
            }
        }
        $projects[$i]["long"]=$long;
        $projects[$i]["lat"]=$lat;
    }
}
$i++;
}
}
Expand All @@ -97,6 +224,17 @@
$mail->Body.=$project["institution"]."\n";
$mail->Body.="http://www.enviroportal.sk".$project["link"]."\n";
$mail->Subject.=" ".$project["name"];
// include screenshot in the email
if ($config["screenshot-apikey"] AND isset($project["long"]) AND isset($project["lat"]))
{
    // use the coordinates stored on this project; the bare $long/$lat variables hold
    // whatever the scraping loop assigned last and may belong to a different project
    $screenshoturl='http://api.screenshotmachine.com/?key='.$config["screenshot-apikey"].'&size=F&format=PNG&cacheLimit=0&timeout=1000&url=http%3A%2F%2Flabs.strava.com%2Fheatmap%2F%2315%2F'.rawurlencode(trim($project["long"])).'%2F'.rawurlencode(trim($project["lat"])).'%2Fblue%2Fbike';
    $mapimage=imagecreatefrompng($screenshoturl);
    // the screenshot service may fail or return something that is not a PNG; skip the attachment then
    if ($mapimage!==FALSE)
    {
        // crop the map area out of the full-page screenshot
        $mapimagenew=imagecreatetruecolor(579,708);
        imagecopy($mapimagenew,$mapimage,0,0,445,60,1024,708);
        imagepng($mapimagenew,$config['tempdir'].$hash.'.png');
        $mail->AddAttachment($config['tempdir'].$hash.'.png');
        imagedestroy($mapimagenew); imagedestroy($mapimage);
    }
}
// if templating enabled, generate document
if ($config["template"] AND !$existingprojectstate)
{
unset($institutionkey);
Expand Down Expand Up @@ -167,6 +305,11 @@
{
echo ', email '.$mail->Subject.' has been sent';
}
// remove the temporary file once the mail has been handled
// NOTE(review): the screenshot is written to $config['tempdir'].$hash.'.png' when it is attached,
// while this removes $config["tempdir"].$filename - $filename is not set anywhere in the visible
// code; confirm it names the screenshot, otherwise the PNG is never deleted
if ($config["screenshot-apikey"] AND isset($project["long"]) AND isset($project["lat"]))
{
unlink($config["tempdir"].$filename);
}
// drop attachments so they are not carried over to the next mail
$mail->clearAttachments();
echo '<br />';
}
}
Expand Down

0 comments on commit fb1833d

Please sign in to comment.