-
Notifications
You must be signed in to change notification settings - Fork 33
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* WIP * Make debugging easy for fixing encoding bugs * Fix encoding problem that is #225 #242 * More simple implementation for bytes compatible * Make more simple * Remove debugging code * It is a classmethod, not an instance method * Add a test case for sudden EOF * Rename to the correct name * Care multiple scriptencoding * Fix a problem about debug_hint overwriting * Care single line scriptencoding * decoding error is not a RuntimeError but Exception * More debug_hint * Fix a problem about missing last char * Change Chardet priority * Revert "WIP" This reverts commit 1fb7dfc. * Split files * Try to resolve module name conflict * Cosmetic changes * Compose strategies to decoding_strategy
- Loading branch information
Showing
18 changed files
with
302 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import chardet | ||
import sys | ||
from pprint import pprint | ||
from pathlib import Path | ||
from argparse import ArgumentParser | ||
|
||
|
||
def main(file_path):
    # type: (Path) -> None
    """Pretty-print the encoding guess chardet produces for a file.

    The file is read as raw bytes and handed to ``chardet.detect``; the
    returned hint is printed as-is.
    """
    raw_content = file_path.read_bytes()
    pprint(chardet.detect(raw_content))
|
||
|
||
if __name__ == '__main__':
    # Describe what this script actually does in its --help output; `prog` is
    # left to argparse so it follows the name the script is invoked with.
    arg_parser = ArgumentParser(description='Show the encoding detected by chardet')
    # A single positional argument; argparse stores it as a plain string.
    arg_parser.add_argument('file', help='File to detect encoding')
    namespace = arg_parser.parse_args(sys.argv[1:])

    main(Path(namespace.file))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
#!/usr/bin/env python3 | ||
|
||
import sys | ||
from argparse import ArgumentParser | ||
from pathlib import Path | ||
from pprint import pprint | ||
|
||
vint_root = Path(__file__).resolve().parent.parent | ||
sys.path.insert(0, str(vint_root)) | ||
|
||
from vint.encodings.decoder import Decoder | ||
from vint.encodings.decoding_strategy import default_decoding_strategy | ||
|
||
|
||
if __name__ == '__main__':
    # Describe what this script actually does in its --help output; `prog` is
    # left to argparse so it follows the name the script is invoked with.
    arg_parser = ArgumentParser(
        description='Show the debug hints collected while decoding a file')
    # A single positional argument; argparse stores it as a plain string.
    arg_parser.add_argument('file', help='File to detect encoding')
    namespace = arg_parser.parse_args(sys.argv[1:])

    file_path = Path(namespace.file)
    decoder = Decoder(default_decoding_strategy)
    # The decoded text is not needed here; only the diagnostics gathered
    # during decoding are printed.
    decoder.read(file_path)
    pprint(decoder.debug_hint)
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
echo "Only ASCII" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
scriptencoding cp932 "日本語 |
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
scriptencoding utf-8 | ||
" :purple_heart: 💜 | ||
" set list listchars=tab:»·,trail:·,eol:¬,nbsp:_,extends:❯,precedes:❮ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
echo "before 1" | ||
scriptencoding utf8 | ||
echo "before 2" | ||
scriptencoding utf8 | ||
echo "after 2" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
echo "no scriptencofing 1" | ||
echo "no scriptencofing 2" | ||
echo "no scriptencofing 3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
echo "no scriptencoding 1" | ||
echo "no scriptencoding 2" | ||
echo "no scriptencoding 3" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
echo "before" | ||
scriptencoding utf8 | ||
echo "after" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
scriptencoding |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
import sys | ||
from typing import Dict, Any | ||
from pprint import pformat | ||
from pathlib import Path | ||
from vint.encodings.decoding_strategy import DecodingStrategy | ||
|
||
|
||
# Byte pattern searched for in raw file content to find `scriptencoding` occurrences.
SCRIPTENCODING_PREFIX = bytearray(b'scriptencoding')
|
||
|
||
|
||
class Decoder(object):
    """Turn the raw bytes of a file into text with a pluggable decoding strategy.

    The content is cut into hunks by ``_split_by_scriptencoding`` and each hunk
    is decoded on its own. ``debug_hint`` accumulates diagnostics: it starts
    with the interpreter version and gains one entry per hunk, keyed by the
    hunk location string.
    """

    def __init__(self, strategy):
        # type: (DecodingStrategy) -> None
        self.strategy = strategy
        self.debug_hint = {'version': sys.version}

    def read(self, file_path):
        # type: (Path) -> str
        """Read ``file_path`` in binary mode and return its decoded content.

        Raises:
            EncodingDetectionError: when the strategy returns ``None`` for a
                hunk; the error carries every hint gathered so far.
        """
        with file_path.open(mode='rb') as binary_file:
            raw_content = binary_file.read()

        decoded_hunks = []

        for location, hunk in _split_by_scriptencoding(raw_content):
            # Register the per-hunk hint before decoding so that it is already
            # part of self.debug_hint if decoding fails.
            hint_for_hunk = {}
            self.debug_hint[location] = hint_for_hunk

            text = self.strategy.decode(hunk, debug_hint=hint_for_hunk)
            if text is None:
                raise EncodingDetectionError(self.debug_hint)

            decoded_hunks.append(text)

        return ''.join(decoded_hunks)
|
||
|
||
def _split_by_scriptencoding(bytes_seq):
    # type: (bytes) -> [(str, bytes)]
    """Cut raw content into consecutive hunks at every `scriptencoding` occurrence.

    Returns a list of ``(location, hunk)`` pairs where ``location`` is the
    string ``"<start>:<end>"`` of the hunk's byte offsets. The hunks are
    contiguous and cover the whole input; empty input yields a single empty
    hunk located at ``"0:0"``.
    """
    total_length = len(bytes_seq)
    hunks = []
    hunk_start = 0

    while True:
        # Search from one byte past the hunk start so that a prefix located at
        # the hunk start itself is not matched again.
        next_prefix_index = bytes_seq.find(SCRIPTENCODING_PREFIX, hunk_start + 1)
        hunk_end = total_length if next_prefix_index < 0 else next_prefix_index

        location = "{start_index}:{end_index}".format(start_index=hunk_start, end_index=hunk_end)
        hunks.append((location, bytes_seq[hunk_start:hunk_end]))

        if hunk_end >= total_length:
            return hunks

        hunk_start = hunk_end
|
||
|
||
class EncodingDetectionError(Exception):
    """Raised when the encoding of some content could not be determined.

    ``debug_hint`` holds the diagnostics collected while decoding was
    attempted and is rendered into the error message.
    """

    def __init__(self, debug_hint):
        # type: (Dict[str, Any]) -> None
        # Initialize the base class so that ``args`` carries the hint; without
        # it ``args`` is empty and the exception cannot be rebuilt from its
        # arguments (e.g. when pickled or copied).
        super(EncodingDetectionError, self).__init__(debug_hint)
        self.debug_hint = debug_hint

    def __str__(self):
        # type: () -> str
        return 'Cannot detect encoding (binary file?): {debug_hint}'.format(
            debug_hint=pformat(self.debug_hint)
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
import chardet | ||
from typing import Optional, Dict, Any | ||
|
||
|
||
# Byte patterns used when parsing a `scriptencoding` declaration out of raw content.
SCRIPTENCODING_PREFIX = bytearray(b'scriptencoding')
COMMENT_START_TOKEN = bytearray(b'"')
LF = bytearray(b'\n')
|
||
|
||
class DecodingStrategy(object):
    """Interface of a decoding strategy.

    Subclasses override ``decode`` to return the decoded text, or ``None``
    when they cannot decode the given bytes. Diagnostics are written into
    ``debug_hint``.
    """

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, str]) -> Optional[str]
        raise NotImplementedError()
|
||
|
||
class DecodingStrategyByChardet(DecodingStrategy):
    """Decode with whatever encoding chardet guesses for the bytes."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the text decoded with chardet's guess, or ``None`` on failure.

        The guess and its confidence are always recorded in ``debug_hint``;
        the error message is recorded as well when decoding fails.
        """
        detection = chardet.detect(bytearray(bytes_seq))
        guessed_encoding = detection['encoding']

        debug_hint['chardet_encoding'] = guessed_encoding
        debug_hint['chardet_confidence'] = detection['confidence']

        try:
            decoded = bytes_seq.decode(guessed_encoding)
        except Exception as error:
            # Any failure here means this strategy cannot handle the bytes;
            # report it through the hint instead of raising.
            debug_hint['chardet_error'] = str(error)
            return None

        return decoded
|
||
|
||
class ComposedDecodingStrategy(DecodingStrategy):
    """Try several strategies in order and use the first one that succeeds."""

    def __init__(self, strategies):
        # type: ([DecodingStrategy]) -> None
        self.strategies = strategies

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the first non-``None`` result among the composed strategies.

        All strategies share the same ``debug_hint``. The class names of the
        composed strategies are recorded up front, and the name of the one that
        produced the result is recorded on success. Returns ``None`` when every
        strategy gives up.
        """
        debug_hint['composed_strategies'] = [
            type(candidate).__name__ for candidate in self.strategies
        ]

        for candidate in self.strategies:
            decoded = candidate.decode(bytes_seq, debug_hint)

            if decoded is not None:
                debug_hint['selected_strategy'] = type(candidate).__name__
                return decoded

        return None
|
||
|
||
class DecodingStrategyForEmpty(DecodingStrategy):
    """Handle zero-length input, which needs no encoding detection at all."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return ``''`` for empty input and ``None`` for anything else."""
        is_empty = len(bytes_seq) == 0

        debug_hint['empty'] = 'true' if is_empty else 'false'
        return '' if is_empty else None
|
||
|
||
class DecodingStrategyByScriptencoding(DecodingStrategy):
    """Decode with the encoding named by a `scriptencoding` declaration in the bytes."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the text decoded with the declared encoding, or ``None``.

        ``None`` is returned when no declaration is found, when the declared
        name is not a known codec, or when the bytes cannot be decoded with it,
        so that a composing strategy can fall through to the next candidate.
        The declared name (or the string ``'None'``) and any error message are
        recorded in ``debug_hint``.
        """
        encoding_part = DecodingStrategyByScriptencoding.parse_script_encoding(bytes_seq, debug_hint)

        if encoding_part is None:
            debug_hint['scriptencoding'] = 'None'
            return None

        debug_hint['scriptencoding'] = encoding_part

        try:
            return bytes_seq.decode(encoding=encoding_part.decode(encoding='ascii'))

        except (LookupError, ValueError) as e:
            # LookupError: the declared name is not a known codec.
            # ValueError (includes UnicodeDecodeError): the name is not ASCII
            # or the content is not valid in the declared encoding.
            debug_hint['scriptencoding_error'] = str(e)
            return None

    @classmethod
    def parse_script_encoding(cls, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[bytes]
        """Extract the encoding name following the first `scriptencoding` in ``bytes_seq``.

        The name spans from the end of the keyword up to the first line break
        or comment start token, whichever comes first, or up to the end of the
        input when neither is present. Surrounding whitespace is stripped.
        Returns ``None`` (and records the reason in ``debug_hint``) when the
        keyword is absent.
        """
        start_index = bytes_seq.find(SCRIPTENCODING_PREFIX)

        if start_index < 0:
            debug_hint['scriptencoding_error'] = '`scriptencoding` is not found'
            return None

        encoding_part_start_index = start_index + len(SCRIPTENCODING_PREFIX)

        # Possible terminators of the encoding name:
        #   :scriptencoding foo\n
        #   :scriptencoding foo "comment
        end_index_candidates = [
            index
            for index in (
                bytes_seq.find(LF, encoding_part_start_index),
                bytes_seq.find(COMMENT_START_TOKEN, encoding_part_start_index),
            )
            if index >= 0
        ]

        if end_index_candidates:
            encoding_part_end_index = min(end_index_candidates)
        else:
            # Case for :scriptencoding foo<EOF>
            # The slice end is exclusive, so the full length keeps the last byte.
            encoding_part_end_index = len(bytes_seq)

        return bytes_seq[encoding_part_start_index:encoding_part_end_index].strip()
|
||
|
||
class DecodingStrategyForUTF8(DecodingStrategy):
    """Try to decode the bytes as UTF-8."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the UTF-8 decoded text, or ``None`` when decoding fails.

        The outcome is recorded under the ``'utf-8'`` key of ``debug_hint``.
        """
        try:
            decoded = bytes_seq.decode('utf8')
        except Exception as error:
            debug_hint['utf-8'] = 'failed: {}'.format(str(error))
            return None

        debug_hint['utf-8'] = 'success'
        return decoded
|
||
|
||
# The composed strategy returns the first non-None result, so the order below
# is the priority order: empty input, declared scriptencoding, UTF-8, chardet.
default_decoding_strategy = ComposedDecodingStrategy(
    [
        strategy_class()
        for strategy_class in (
            DecodingStrategyForEmpty,
            DecodingStrategyByScriptencoding,
            DecodingStrategyForUTF8,
            DecodingStrategyByChardet,
        )
    ]
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters