Skip to content

Commit

Permalink
Try to fix #225, #242 (#248)
Browse files Browse the repository at this point in the history
* WIP

* Make debugging easy for fix encoding bugs

* Fix encoding problem that is #225 #242

* More simple implementation for bytes compatible

* Make more simple

* Remove debugging code

* It is a classmethod, not instance method

* Add a test case for sudden EOF

* Rename to the correct name

* Care multiple scriptencoding

* Fix a problem about debug_hint overwriting

* Care single line scriptencoding

* decoding error is not a RuntimeError but Exception

* More debug_hint

* Fix a problem about missing last char

* Change Chardet priority

* Revert "WIP"

This reverts commit 1fb7dfc.

* Split files

* Try to resolve module name conflict

* Cosmetic changes

* Compose strategies to decoding_strategy
  • Loading branch information
Kuniwak authored Nov 11, 2017
1 parent 79b0f1a commit d4aa952
Show file tree
Hide file tree
Showing 18 changed files with 302 additions and 29 deletions.
4 changes: 2 additions & 2 deletions dev_tool/show_ast.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
#!/usr/bin/env python
#!/usr/bin/env python3

import sys
from argparse import ArgumentParser
from pathlib import Path
from pprint import pprint

vint_root = Path(__file__).resolve().parent.parent
sys.path.append(str(vint_root))
sys.path.insert(0, str(vint_root))

from vint.ast.node_type import NodeType
from vint.ast.traversing import traverse
Expand Down
24 changes: 24 additions & 0 deletions dev_tool/show_chardet_result.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/usr/bin/env python3

import chardet
import sys
from pprint import pprint
from pathlib import Path
from argparse import ArgumentParser


def main(file_path):
    # type: (Path) -> None
    """Print chardet's detection result for the raw bytes of *file_path*."""
    with file_path.open(mode='rb') as source:
        raw_content = source.read()

    # chardet.detect() is given the undecoded file content as-is.
    detection = chardet.detect(raw_content)
    pprint(detection)


if __name__ == '__main__':
    # Name and describe this tool itself so that --help/usage output is
    # accurate (it does not show an AST; it prints chardet's detection result).
    arg_parser = ArgumentParser(
        prog='show_chardet_result',
        description='Show the encoding detected by chardet')
    arg_parser.add_argument('file', nargs=1, help='File to detect encoding')
    namespace = vars(arg_parser.parse_args(sys.argv[1:]))

    # nargs=1 stores the argument as a one-element list, hence the [0].
    main(Path(namespace['file'][0]))
23 changes: 23 additions & 0 deletions dev_tool/show_encoding.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#!/usr/bin/env python3

import sys
from argparse import ArgumentParser
from pathlib import Path
from pprint import pprint

vint_root = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(vint_root))

from vint.encodings.decoder import Decoder
from vint.encodings.decoding_strategy import default_decoding_strategy


if __name__ == '__main__':
    # Name and describe this tool itself so that --help/usage output is
    # accurate (it prints the decoder's debug hints, not an AST).
    arg_parser = ArgumentParser(
        prog='show_encoding',
        description='Show the encoding detection debug hints')
    arg_parser.add_argument('file', nargs=1, help='File to detect encoding')
    namespace = vars(arg_parser.parse_args(sys.argv[1:]))

    # nargs=1 stores the argument as a one-element list, hence the [0].
    file_path = Path(namespace['file'][0])

    # The decoded text itself is discarded; only the hints collected while
    # decoding are of interest here.
    decoder = Decoder(default_decoding_strategy)
    decoder.read(file_path)
    pprint(decoder.debug_hint)
Empty file.
1 change: 1 addition & 0 deletions test/fixture/encodings/ascii.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
echo "Only ASCII"
1 change: 1 addition & 0 deletions test/fixture/encodings/cp932.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
scriptencoding cp932 "���{��
Empty file.
3 changes: 3 additions & 0 deletions test/fixture/encodings/issue-225.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
scriptencoding utf-8
" :purple_heart: 💜
" set list listchars=tab:»·,trail:·,eol:¬,nbsp:_,extends:❯,precedes:❮
5 changes: 5 additions & 0 deletions test/fixture/encodings/multiple-scriptencoding.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
echo "before 1"
scriptencoding utf8
echo "before 2"
scriptencoding utf8
echo "after 2"
3 changes: 3 additions & 0 deletions test/fixture/encodings/no-scriptencoding.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
echo "no scriptencofing 1"
echo "no scriptencofing 2"
echo "no scriptencofing 3"
3 changes: 3 additions & 0 deletions test/fixture/encodings/no_scriptencoding.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
echo "no scriptencoding 1"
echo "no scriptencoding 2"
echo "no scriptencoding 3"
3 changes: 3 additions & 0 deletions test/fixture/encodings/single-scriptencoding.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
echo "before"
scriptencoding utf8
echo "after"
1 change: 1 addition & 0 deletions test/fixture/encodings/sudden_eof.vim
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
scriptencoding
36 changes: 10 additions & 26 deletions vint/ast/parsing.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,9 @@
import chardet
import re
from vint._bundles import vimlparser
from vint.ast.traversing import traverse


class EncodingDetectionError(Exception):
    """Error carrying the path of a file whose encoding could not be detected."""

    def __init__(self, file_path):
        self.file_path = file_path

    def __str__(self):
        path_text = str(self.file_path)
        message_template = 'Cannot detect encoding (binary file?): {file_path}'
        return message_template.format(file_path=path_text)
from vint.encodings.decoder import Decoder
from vint.encodings.decoding_strategy import default_decoding_strategy
from pprint import pprint


class Parser(object):
Expand Down Expand Up @@ -42,23 +34,15 @@ def parse(self, string):

def parse_file(self, file_path):
""" Parse vim script file and return the AST. """
# NOTE(review): this span is a rendered diff hunk without +/- markers, so the
# removed and the added statements appear back to back. The chardet-based
# statements below look like the removed implementation and the Decoder-based
# ones like its replacement -- confirm against the real file.
with file_path.open(mode='rb') as f:
bytes_seq = f.read()

is_empty = len(bytes_seq) == 0
if is_empty:
return self.parse('')

encoding_hint = chardet.detect(bytes_seq)
encoding = encoding_hint['encoding']
if not encoding:
# Falsey means we cannot detect the encoding of the file.
raise EncodingDetectionError(file_path)

decoded = bytes_seq.decode(encoding)
decoded_and_lf_normalized = decoded.replace('\r\n', '\n')
# Decode the file through the composed default strategy.
decoder = Decoder(default_decoding_strategy)
decoded = decoder.read(file_path)
# Normalize CRLF line endings to LF before handing the text to the parser.
decoded_and_lf_normalized = decoded.replace('\r\n', '\n')

try:
return self.parse(decoded_and_lf_normalized)
except vimlparser.VimLParserException:
# Print the hints collected while decoding, then re-raise the parser error
# unchanged.
pprint(decoder.debug_hint)
raise


def parse_redir(self, redir_cmd):
Expand Down
Empty file added vint/encodings/__init__.py
Empty file.
74 changes: 74 additions & 0 deletions vint/encodings/decoder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import sys
from typing import Dict, Any
from pprint import pformat
from pathlib import Path
from vint.encodings.decoding_strategy import DecodingStrategy


# Byte form of the `scriptencoding` command name. _split_by_scriptencoding()
# searches the undecoded file content for it to find hunk boundaries.
SCRIPTENCODING_PREFIX = bytearray('scriptencoding', encoding='ascii')



class Decoder(object):
    """Read a file as bytes and decode it hunk by hunk with a strategy.

    ``debug_hint`` starts with the interpreter version and gains one entry
    per decoded hunk, keyed by the hunk's "start:end" location string.
    """

    def __init__(self, strategy):
        # type: (DecodingStrategy) -> None
        self.strategy = strategy
        self.debug_hint = dict(version=sys.version)

    def read(self, file_path):
        # type: (Path) -> str
        """Return the decoded content of *file_path*.

        The raw bytes are split by _split_by_scriptencoding() and every hunk
        is decoded separately; the decoded hunks are concatenated in order.

        Raises:
            EncodingDetectionError: when the strategy returns None for a hunk.
        """
        with file_path.open(mode='rb') as source:
            raw_content = source.read()

        decoded_hunks = []
        for location, hunk in _split_by_scriptencoding(raw_content):
            # Register the per-hunk hint dict before decoding so that it is
            # part of self.debug_hint even when decoding fails.
            hunk_hint = {}
            self.debug_hint[location] = hunk_hint

            text = self.strategy.decode(hunk, debug_hint=hunk_hint)
            if text is None:
                raise EncodingDetectionError(self.debug_hint)

            decoded_hunks.append(text)

        return ''.join(decoded_hunks)


def _split_by_scriptencoding(bytes_seq):
    # type: (bytes) -> [(str, bytes)]
    """Split raw bytes into hunks at every `scriptencoding` occurrence.

    Returns a list of ``(location, hunk)`` pairs where ``location`` is the
    string "start:end" of the hunk's byte offsets. The first hunk always
    starts at offset 0; every later hunk starts at a `scriptencoding`
    occurrence. Empty input yields a single empty hunk located at "0:0".
    """
    total_length = len(bytes_seq)
    hunks = []
    hunk_start = 0

    while True:
        # Search from the byte after hunk_start so that a prefix sitting
        # exactly at hunk_start is not found again.
        hunk_end = bytes_seq.find(SCRIPTENCODING_PREFIX, hunk_start + 1)
        if hunk_end < 0:
            # No further occurrence: the current hunk runs to the end.
            hunk_end = total_length

        location = "{start_index}:{end_index}".format(
            start_index=hunk_start, end_index=hunk_end)
        hunks.append((location, bytes_seq[hunk_start:hunk_end]))

        if hunk_end >= total_length:
            return hunks

        hunk_start = hunk_end


class EncodingDetectionError(Exception):
    """Error carrying the debug hints collected while decoding failed."""

    def __init__(self, debug_hint):
        # type: (Dict[str, Any]) -> None
        self.debug_hint = debug_hint

    def __str__(self):
        # type: () -> str
        # The hints are pretty-formatted so nested per-hunk dicts stay readable.
        formatted_hint = pformat(self.debug_hint)
        message_template = 'Cannot detect encoding (binary file?): {debug_hint}'
        return message_template.format(debug_hint=formatted_hint)
147 changes: 147 additions & 0 deletions vint/encodings/decoding_strategy.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import chardet
from typing import Optional, Dict, Any


# Byte patterns used by DecodingStrategyByScriptencoding.parse_script_encoding()
# to locate the `scriptencoding` argument in undecoded content.
# Command name that precedes the encoding name.
SCRIPTENCODING_PREFIX = bytearray('scriptencoding', encoding='ascii')
# A double quote (start of a trailing comment) ends the encoding name.
COMMENT_START_TOKEN = bytearray('"', encoding='ascii')
# A line feed also ends the encoding name.
LF = bytearray("\n", encoding='ascii')


class DecodingStrategy(object):
    """Base class for decoding strategies; subclasses override decode()."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, str]) -> Optional[str]
        """Return the decoded text, or None when this strategy cannot decode.

        Implementations record diagnostic details in *debug_hint*.
        """
        raise NotImplementedError()


class DecodingStrategyByChardet(DecodingStrategy):
    """Decode with the encoding guessed by chardet."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the text decoded with chardet's guess, or None on failure.

        The guessed encoding and its confidence are recorded in *debug_hint*;
        on failure the error message is recorded under 'chardet_error'.
        """
        # chardet receives a bytearray copy of the input.
        detection = chardet.detect(bytearray(bytes_seq))
        guessed_encoding = detection['encoding']

        debug_hint['chardet_encoding'] = guessed_encoding
        debug_hint['chardet_confidence'] = detection['confidence']

        try:
            decoded = bytes_seq.decode(guessed_encoding)
        except Exception as e:
            # Any failure is treated as "this strategy does not apply".
            debug_hint['chardet_error'] = str(e)
            return None

        return decoded


class ComposedDecodingStrategy(DecodingStrategy):
    """Try several strategies in order and use the first one that succeeds."""

    def __init__(self, strategies):
        # type: ([DecodingStrategy]) -> None
        self.strategies = strategies

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the first non-None result among the composed strategies.

        All strategies share the same *debug_hint*; the names of the composed
        strategies and of the selected one are recorded in it. Returns None
        when every strategy returns None.
        """
        strategy_names = []
        for strategy in self.strategies:
            strategy_names.append(type(strategy).__name__)
        debug_hint['composed_strategies'] = strategy_names

        for strategy in self.strategies:
            decoded = strategy.decode(bytes_seq, debug_hint)
            if decoded is not None:
                debug_hint['selected_strategy'] = type(strategy).__name__
                return decoded

        return None


class DecodingStrategyForEmpty(DecodingStrategy):
    """Handle empty input, which needs no decoding."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return '' for empty input and None for anything else.

        The outcome is recorded in *debug_hint* under 'empty' as the string
        'true' or 'false'.
        """
        if len(bytes_seq) > 0:
            debug_hint['empty'] = 'false'
            return None

        debug_hint['empty'] = 'true'
        return ''


class DecodingStrategyByScriptencoding(DecodingStrategy):
    """Decode with the encoding named by a `scriptencoding` command."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the text decoded with the declared encoding, or None.

        None is returned when no `scriptencoding` command is found, when the
        declared name is not a known codec, or when the bytes are not valid
        in the declared encoding. Details are recorded in *debug_hint* under
        'scriptencoding' and 'scriptencoding_error'.
        """
        encoding_part = self.parse_script_encoding(bytes_seq, debug_hint)

        if encoding_part is None:
            debug_hint['scriptencoding'] = 'None'
            return None

        debug_hint['scriptencoding'] = encoding_part

        try:
            return bytes_seq.decode(encoding=encoding_part.decode(encoding='ascii'))

        except (LookupError, ValueError) as e:
            # LookupError: the declared name is not a known codec.
            # ValueError (includes UnicodeDecodeError): the declared name is
            # not usable as a codec name, or the content is not valid in the
            # declared encoding. Either way this strategy does not apply, so
            # return None and let the caller try the next strategy instead of
            # letting the error escape.
            debug_hint['scriptencoding_error'] = str(e)
            return None

    @classmethod
    def parse_script_encoding(cls, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[bytes]
        """Return the raw encoding name following the first `scriptencoding`.

        The name ends at the first line feed or comment start (double quote)
        after the command, whichever comes first, or at the end of the input
        when neither is present. Surrounding whitespace is stripped, so the
        result may be empty. Returns None (and records 'scriptencoding_error'
        in *debug_hint*) when `scriptencoding` does not occur at all.
        """
        start_index = bytes_seq.find(SCRIPTENCODING_PREFIX)
        if start_index < 0:
            debug_hint['scriptencoding_error'] = '`scriptencoding` is not found'
            return None

        encoding_part_start_index = start_index + len(SCRIPTENCODING_PREFIX)

        # Possible terminators of the encoding name; find() gives -1 for a
        # terminator that does not occur after the command.
        end_index_candidates = [
            index
            for index in (
                bytes_seq.find(LF, encoding_part_start_index),
                bytes_seq.find(COMMENT_START_TOKEN, encoding_part_start_index),
            )
            if index >= 0
        ]

        if end_index_candidates:
            # Cases `scriptencoding foo\n`, `scriptencoding foo "comment\n`
            # and `scriptencoding foo "comment` at EOF.
            encoding_part_end_index = min(end_index_candidates)
        else:
            # Case `scriptencoding foo` at EOF. The slice end is exclusive, so
            # it must be len(bytes_seq); len(bytes_seq) - 1 would drop the
            # last byte of the encoding name.
            encoding_part_end_index = len(bytes_seq)

        encoding_part_candidate = bytes_seq[encoding_part_start_index:encoding_part_end_index]
        return encoding_part_candidate.strip()


class DecodingStrategyForUTF8(DecodingStrategy):
    """Decode the input as UTF-8."""

    def decode(self, bytes_seq, debug_hint):
        # type: (bytes, Dict[str, Any]) -> Optional[str]
        """Return the UTF-8 decoded text, or None when decoding fails.

        The outcome is recorded in *debug_hint* under 'utf-8'.
        """
        try:
            decoded = bytes_seq.decode('utf8')
            debug_hint['utf-8'] = 'success'
        except Exception as e:
            # Any failure is treated as "this strategy does not apply".
            debug_hint['utf-8'] = 'failed: {}'.format(str(e))
            return None

        return decoded


# Strategies are tried in list order and the first one returning a non-None
# result wins: empty input, then an explicit `scriptencoding` declaration,
# then plain UTF-8, and chardet's guess last.
default_decoding_strategy = ComposedDecodingStrategy(
    [
        DecodingStrategyForEmpty(),
        DecodingStrategyByScriptencoding(),
        DecodingStrategyForUTF8(),
        DecodingStrategyByChardet(),
    ]
)
3 changes: 2 additions & 1 deletion vint/linting/linter.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
import logging
from pathlib import Path
from vint._bundles import vimlparser
from vint.ast.parsing import Parser, EncodingDetectionError
from vint.encodings.decoder import EncodingDetectionError
from vint.ast.parsing import Parser
from vint.ast.node_type import NodeType
from vint.ast.traversing import traverse
from vint.ast.plugin.scope_plugin import ScopePlugin
Expand Down

0 comments on commit d4aa952

Please sign in to comment.