-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
RTF parsing + screenshotting capability added for further information
- Loading branch information
Showing
3 changed files
with
297 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,148 @@ | ||
<?php | ||
|
||
function rtf_isPlainText($s) { | ||
$arrfailAt = array("*", "fonttbl", "colortbl", "datastore", "themedata"); | ||
for ($i = 0; $i < count($arrfailAt); $i++) | ||
if (!empty($s[$arrfailAt[$i]])) return false; | ||
return true; | ||
} | ||
|
||
function rtf2text($text) { | ||
if (!strlen($text)) | ||
return ""; | ||
|
||
// Create empty stack array. | ||
$document = ""; | ||
$stack = array(); | ||
$j = -1; | ||
// Read the data character-by- character… | ||
for ($i = 0, $len = strlen($text); $i < $len; $i++) { | ||
$c = $text[$i]; | ||
|
||
// Depending on current character select the further actions. | ||
switch ($c) { | ||
// the most important key word backslash | ||
case "\\": | ||
// read next character | ||
$nc = $text[$i + 1]; | ||
|
||
// If it is another backslash or nonbreaking space or hyphen, | ||
// then the character is plain text and add it to the output stream. | ||
if ($nc == '\\' && rtf_isPlainText($stack[$j])) $document .= '\\'; | ||
elseif ($nc == '~' && rtf_isPlainText($stack[$j])) $document .= ' '; | ||
elseif ($nc == '_' && rtf_isPlainText($stack[$j])) $document .= '-'; | ||
// If it is an asterisk mark, add it to the stack. | ||
elseif ($nc == '*') $stack[$j]["*"] = true; | ||
// If it is a single quote, read next two characters that are the hexadecimal notation | ||
// of a character we should add to the output stream. | ||
elseif ($nc == "'") { | ||
$hex = substr($text, $i + 2, 2); | ||
if (rtf_isPlainText($stack[$j])) | ||
$document .= html_entity_decode("&#".hexdec($hex).";"); | ||
//Shift the pointer. | ||
$i += 2; | ||
// Since, we’ve found the alphabetic character, the next characters are control word | ||
// and, possibly, some digit parameter. | ||
} elseif ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') { | ||
$word = ""; | ||
$param = null; | ||
|
||
// Start reading characters after the backslash. | ||
for ($k = $i + 1, $m = 0; $k < strlen($text); $k++, $m++) { | ||
$nc = $text[$k]; | ||
// If the current character is a letter and there were no digits before it, | ||
// then we’re still reading the control word. If there were digits, we should stop | ||
// since we reach the end of the control word. | ||
if ($nc >= 'a' && $nc <= 'z' || $nc >= 'A' && $nc <= 'Z') { | ||
if (empty($param)) | ||
$word .= $nc; | ||
else | ||
break; | ||
// If it is a digit, store the parameter. | ||
} elseif ($nc >= '0' && $nc <= '9') | ||
$param .= $nc; | ||
// Since minus sign may occur only before a digit parameter, check whether | ||
// $param is empty. Otherwise, we reach the end of the control word. | ||
elseif ($nc == '-') { | ||
if (empty($param)) | ||
$param .= $nc; | ||
else | ||
break; | ||
} else | ||
break; | ||
} | ||
// Shift the pointer on the number of read characters. | ||
$i += $m - 1; | ||
|
||
// Start analyzing what we’ve read. We are interested mostly in control words. | ||
$toText = ""; | ||
switch (strtolower($word)) { | ||
// If the control word is "u", then its parameter is the decimal notation of the | ||
// Unicode character that should be added to the output stream. | ||
// We need to check whether the stack contains \ucN control word. If it does, | ||
// we should remove the N characters from the output stream. | ||
case "u": | ||
//$toText .= html_entity_decode("&#x".dechex($param).";"); | ||
$ucDelta = @$stack[$j]["uc"]; | ||
if ($ucDelta > 0) | ||
$i += $ucDelta; | ||
break; | ||
// Select line feeds, spaces and tabs. | ||
case "par": case "page": case "column": case "line": case "lbr": | ||
$toText .= "\n"; | ||
break; | ||
case "emspace": case "enspace": case "qmspace": | ||
$toText .= " "; | ||
break; | ||
case "tab": $toText .= "\t"; break; | ||
// Add current date and time instead of corresponding labels. | ||
case "chdate": $toText .= date("m.d.Y"); break; | ||
case "chdpl": $toText .= date("l, j F Y"); break; | ||
case "chdpa": $toText .= date("D, j M Y"); break; | ||
case "chtime": $toText .= date("H:i:s"); break; | ||
// Replace some reserved characters to their html analogs. | ||
case "emdash": $toText .= html_entity_decode("—"); break; | ||
case "endash": $toText .= html_entity_decode("–"); break; | ||
case "bullet": $toText .= html_entity_decode("•"); break; | ||
case "lquote": $toText .= html_entity_decode("‘"); break; | ||
case "rquote": $toText .= html_entity_decode("’"); break; | ||
case "ldblquote": $toText .= html_entity_decode("«"); break; | ||
case "rdblquote": $toText .= html_entity_decode("»"); break; | ||
// Add all other to the control words stack. If a control word | ||
// does not include parameters, set ¶m to true. | ||
default: | ||
$stack[$j][strtolower($word)] = empty($param) ? true : $param; | ||
break; | ||
} | ||
// Add data to the output stream if required. | ||
if (rtf_isPlainText($stack[$j])) | ||
$document .= $toText; | ||
} | ||
|
||
$i++; | ||
break; | ||
// If we read the opening brace {, then new subgroup starts and we add | ||
// new array stack element and write the data from previous stack element to it. | ||
case "{": | ||
array_push($stack, $stack[$j++]); | ||
break; | ||
// If we read the closing brace }, then we reach the end of subgroup and should remove | ||
// the last stack element. | ||
case "}": | ||
array_pop($stack); | ||
$j--; | ||
break; | ||
// Skip “trash”. | ||
case '\0': case '\r': case '\f': case '\n': break; | ||
// Add other data to the output stream if required. | ||
default: | ||
if (rtf_isPlainText($stack[$j])) | ||
$document .= $c; | ||
break; | ||
} | ||
} | ||
// Return result. | ||
return $document; | ||
} | ||
|
||
?> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters