Skip to content

Commit 20c387e

Browse files
authored
Merge pull request #18 from wahlflo/17-eml_analyzer-text-throws-cyrillic-characters-instead-of-german-umlauts
Fixing Issues with Encodings
2 parents e163290 + 247cb1f commit 20c387e

File tree

8 files changed

+107
-24
lines changed

8 files changed

+107
-24
lines changed

.github/workflows/unit-tests.yml

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,24 +8,28 @@ on:
88

99
jobs:
1010
build:
11-
12-
runs-on: ubuntu-latest
11+
name: Test on ${{ matrix.os }}
1312
strategy:
1413
fail-fast: false
1514
matrix:
15+
os: [ ubuntu-latest, windows-latest ]
1616
python-version: [ "3.7", "3.8", "3.9", "3.10" , "3.11" ]
1717

18+
runs-on: ${{ matrix.os }}
19+
1820
steps:
1921
- uses: actions/checkout@v3
2022
- name: Set up Python ${{ matrix.python-version }}
2123
uses: actions/setup-python@v3
2224
with:
2325
python-version: ${{ matrix.python-version }}
26+
2427
- name: Install dependencies
2528
run: |
2629
python -m pip install --upgrade pip
2730
python -m pip install pytest
28-
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
31+
python -m pip install -r requirements.txt
32+
2933
- name: Test with pytest
3034
run: |
3135
pytest

eml_analyzer/cli_script.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
def main():
1212
argument_parser = argparse.ArgumentParser(prog='emlAnalyzer', description='A CLI script to analyze an email in the EML format for viewing headers, extracting attachments, etc.')
13-
argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r'), nargs='?', default=sys.stdin)
13+
argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r', encoding='utf-8'), nargs='?', default=sys.stdin)
1414
argument_parser.add_argument('--header', action='store_true', default=False, help="Shows the headers")
1515
argument_parser.add_argument('-x', '--tracking', action='store_true', default=False, help="Shows content which is reloaded from external resources in the HTML part")
1616
argument_parser.add_argument('-a', '--attachments', action='store_true', default=False, help="Lists attachments")

eml_analyzer/library/parser/parsed_email.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
from typing import NamedTuple, List, Tuple, Set
88

99
from eml_analyzer.library.parser.attachment import Attachment
10+
from eml_analyzer.library.parser.printable_filename import decode_ASCII_encoded_string
1011
from eml_analyzer.library.parser.structure_item import StructureItem
1112

1213

@@ -48,7 +49,7 @@ def _add_error_messages(self, error_message: str) -> None:
4849

4950
def get_header(self) -> List[Tuple[str, any]]:
5051
""" returns list of key-value pairs of header entries """
51-
return self._parsed_email.items()
52+
return [(key, decode_ASCII_encoded_string(value)) for key, value in self._parsed_email.items()]
5253

5354
def get_structure(self) -> StructureItem:
5455
return StructureItem(message=self._parsed_email)
@@ -86,14 +87,19 @@ def _get_first_email_payload_with_matching_type(message: email.message.Message,
8687

8788
@staticmethod
8889
def _get_decoded_payload_from_message(message: email.message.Message) -> None or str:
90+
transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
91+
if transfer_encoding in {'7bit', '8bit', 'binary'}:
92+
return message.get_payload(decode=False)
93+
8994
payload_in_bytes = message.get_payload(decode=True)
9095

9196
list_of_possible_encodings = ParsedEmail._create_list_of_possible_encodings(message=message)
9297

9398
for encoding_format in list_of_possible_encodings:
9499
try:
95100
return payload_in_bytes.decode(encoding_format)
96-
except ValueError:
101+
except ValueError as error:
102+
print('Error: ' + str(error))
97103
continue
98104
raise PayloadDecodingException('Payload could not be decoded')
99105

@@ -102,23 +108,44 @@ def _create_list_of_possible_encodings(message: email.message.Message) -> list:
102108
""" creates a list of the most possible encodings of a payload """
103109
list_of_possible_encodings = list()
104110

111+
header_values = ParsedEmail._header_lookup(message=message, key='content-type')
112+
105113
# at first add the encodings mentioned in the object header
106-
for k, v in message.items():
107-
k = str(k).lower()
108-
v = str(v).lower()
109-
if k == 'content-type':
110-
entries = v.split(';')
111-
for entry in entries:
112-
entry = entry.strip()
113-
if entry.startswith('charset='):
114-
encoding = entry.replace('charset=', '').replace('"', '')
115-
list_of_possible_encodings.append(encoding)
114+
for v in header_values:
115+
entries = v.split(';')
116+
for entry in entries:
117+
entry = entry.strip()
118+
if entry.startswith('charset='):
119+
encoding = entry.replace('charset=', '').replace('"', '')
120+
list_of_possible_encodings.append(encoding)
116121

117122
for x in ['utf-8', 'windows-1251', 'iso-8859-1', 'us-ascii', 'iso-8859-15']:
118123
if x not in list_of_possible_encodings:
119124
list_of_possible_encodings.append(x)
120125
return list_of_possible_encodings
121126

127+
@staticmethod
def _payload_needs_decoding(message: email.message.Message) -> bool:
    """ tells whether the payload of the message is stored in a transfer encoding that has to be decoded

    A message without a Content-Transfer-Encoding header is reported as needing decoding;
    otherwise only the identity encodings 7bit, 8bit and binary are reported as not needing it.
    """
    identity_encodings = {'7bit', '8bit', 'binary'}
    declared_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
    # a missing header is treated the same way as a non-identity encoding
    return declared_encoding is None or declared_encoding not in identity_encodings
133+
134+
@staticmethod
def _header_lookup_first_element(message: email.message.Message, key: str) -> str or None:
    """ returns the value of the first header whose name matches key, or None if there is no such header

    The header name is compared case-insensitively. The returned value is lower-cased and
    stripped of surrounding whitespace so that it can be compared against literals like '8bit'.
    """
    # normalise the requested name once; header names are lower-cased in the loop as well
    wanted_key = key.lower()
    for header_key, value in message.items():
        if str(header_key).lower() == wanted_key:
            return str(value).lower().strip()
    return None
140+
141+
@staticmethod
def _header_lookup(message: email.message.Message, key: str) -> List[str]:
    """ returns the lower-cased values of all headers whose name matches key, in the order they appear

    The header name is compared case-insensitively; an empty list is returned if no header matches.
    """
    # normalise the requested name once; header names are lower-cased in the comprehension as well
    wanted_key = key.lower()
    return [
        str(value).lower()
        for header_key, value in message.items()
        if str(header_key).lower() == wanted_key
    ]
148+
122149
def get_attachments(self) -> List[Attachment]:
123150
return_list = list()
124151
counter = 0

eml_analyzer/library/parser/printable_filename.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import binascii
33
import re
44
import email.message
5+
import quopri
56

67

78
def get_printable_filename_if_existent(message: email.message.Message) -> str or None:
@@ -12,7 +13,7 @@ def get_printable_filename_if_existent(message: email.message.Message) -> str or
1213

1314

1415
def _make_string_printable(original_string: str) -> str:
15-
original_string = _decode_ASCII_encoded_string(string=original_string)
16+
original_string = decode_ASCII_encoded_string(string=original_string)
1617

1718
additional_allowed_chars = {'_', '.', '(', ')', '-', ' '}
1819
clean_name = ''
@@ -24,7 +25,13 @@ def _make_string_printable(original_string: str) -> str:
2425
return clean_name
2526

2627

27-
def _decode_ASCII_encoded_string(string: str) -> str:
28+
def decode_ASCII_encoded_string(string: str) -> str:
    """ decodes encoded words of the form "=?charset?B?...?=" and "=?charset?Q?...?=" inside an ASCII string

    The base64 ("B") words are handled first, the quoted-printable ("Q") words afterwards.
    """
    return _decode_ASCII_encoded_string_quoted_printable_string(
        string=_decode_ASCII_encoded_string_baseX(string=string)
    )
32+
33+
34+
def _decode_ASCII_encoded_string_baseX(string: str) -> str:
2835
""" decodes ASCII strings which are encoded like: name := "=?UTF-8?B?" + base64_encode(string) + "?=" """
2936
pattern = re.compile(r'=\?(.+?)\?B\?(.+?)\?=', re.IGNORECASE)
3037
for match in list(re.finditer(pattern=pattern, string=string)):
@@ -33,3 +40,17 @@ def _decode_ASCII_encoded_string(string: str) -> str:
3340
except binascii.Error:
3441
pass
3542
return string
43+
44+
45+
def _decode_ASCII_encoded_string_quoted_printable_string(string: str) -> str:
    """ decodes ASCII strings which are encoded like: name := "=?UTF-8?Q?" + quoted_printable_encode(string) + "?="

    Every encoded word found in the string is replaced by its decoded text. A word which cannot be
    decoded (unknown charset, bytes that are invalid for the charset, non-ASCII input) is left
    unchanged instead of aborting the decoding of the whole string.

    NOTE(review): RFC 2047 allows "_" to stand for a space inside a Q-encoded word. Underscores are
    kept as they are here (quopri.decodestring is called without header=True) - confirm whether
    that is intended before changing it, since existing expectations may depend on it.
    """
    pattern = re.compile(r'=\?(.+?)\?Q\?(.+?)\?=', re.IGNORECASE)
    for match in list(re.finditer(pattern=pattern, string=string)):
        encoding = match.group(1)
        encoded_string = match.group(2)
        try:
            decoded_string = quopri.decodestring(encoded_string)
            replacement = decoded_string.decode(encoding)
        except (ValueError, LookupError):
            # ValueError covers binascii.Error and UnicodeDecodeError,
            # LookupError is raised for a charset name Python does not know
            continue
        string = string.replace(match.group(0), replacement)
    return string

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
setuptools.setup(
1010
name="eml-analyzer",
11-
version="2.0.3",
11+
version="3.0.0",
1212
author="Florian Wahl",
1313
author_email="[email protected]",
1414
description="A cli script to analyze an E-Mail in the eml format for viewing the header, extracting attachments, etc.",
tests/library/parser/test_emails/utf8_with_umlauts.eml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Message-ID: <[email protected]>
2+
Date: Tue, 25 Jul 2023 16:07:11 +0200
3+
MIME-Version: 1.0
4+
User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
5+
Thunderbird/102.13.0
6+
Content-Language: en-US
7+
8+
9+
Subject: =?UTF-8?Q?Dies_ist_ein_d=c3=a4mlicher_Test?=
10+
Content-Type: text/plain; charset=UTF-8; format=flowed
11+
Content-Transfer-Encoding: 8bit
12+
13+
Dies ist ein dämlicher Test.

tests/library/parser/test_parsed_email.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ def load_test_eml_file(test_file) -> str:
88
current_directory_of_the_script = os.path.dirname(__file__)
99
test_emails = os.path.join(current_directory_of_the_script, 'test_emails')
1010
path_to_test_file = os.path.join(test_emails, test_file)
11-
with open(path_to_test_file, mode='r') as input_file:
11+
with open(path_to_test_file, mode='r', encoding='utf-8') as input_file:
1212
return input_file.read()
1313

1414

@@ -32,7 +32,7 @@ def test_case_1_header_subject(self):
3232
header = x.get_header()
3333
for key, value in header:
3434
if key == 'Subject':
35-
self.assertIn(value, 'UnitTest Subject =?UTF-8?B?TcO8bmNoZW4s?=')
35+
self.assertEqual(value, 'UnitTest Subject München,')
3636
return
3737
self.fail(msg="header subject not found")
3838

@@ -228,4 +228,19 @@ def test_get_reloaded_content_from_html_case_3(self):
228228

229229
def url_decode(self):
230230
import urllib.parse
231-
self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))
231+
self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))
232+
233+
def test_case_uf8_with_umlauts_txt(self):
234+
eml_content = load_test_eml_file('utf8_with_umlauts.eml')
235+
x = ParsedEmail(eml_content=eml_content)
236+
self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.')
237+
238+
def test_case_uf8_with_umlauts_header(self):
239+
eml_content = load_test_eml_file('utf8_with_umlauts.eml')
240+
x = ParsedEmail(eml_content=eml_content)
241+
header = x.get_header()
242+
for key, value in header:
243+
if key == 'Subject':
244+
self.assertEqual(value, 'Dies_ist_ein_dämlicher_Test')
245+
return
246+
self.fail(msg="header subject not found")

tests/library/parser/test_printable_filename.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import unittest
22

3-
from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, _decode_ASCII_encoded_string
3+
from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, decode_ASCII_encoded_string
44

55

66
class TestPrintableFilename(unittest.TestCase):
@@ -12,9 +12,10 @@ def test_decode_ASCII_encoded_string(self):
1212
('=?UTF-8?B?4o6Y7Z+/?=', '⎘퟿'),
1313
('=?utf-8?b?4o6Y7Z+/?=', '⎘퟿'),
1414
('=?utf-16?b?SABlAGwAbABvAFcAbwByAGwAZAA=?=', 'HelloWorld'),
15+
('=?UTF-8?Q?=c3=a4?=', 'ä'),
1516
]
1617
for value, expected in test_cases:
17-
result = _decode_ASCII_encoded_string(string=value)
18+
result = decode_ASCII_encoded_string(string=value)
1819
self.assertEqual(result, expected)
1920

2021
def test_make_string_printable(self):
@@ -24,6 +25,7 @@ def test_make_string_printable(self):
2425
('Hello World', 'Hello World'),
2526
('=?UTF-8?B?7Z+/?=', ''), # character is not printable
2627
('=?UTF-8?B?4o6Y?=', '_'), # character is printable
28+
('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable
2729
]
2830
for value, expected in test_cases:
2931
result = _make_string_printable(original_string=value)
@@ -36,6 +38,7 @@ def test_get_printable_filename_if_existent(self):
3638
('Hello World', 'Hello World'),
3739
('=?UTF-8?B?7Z+/?=', ''), # character is not printable
3840
('=?UTF-8?B?4o6Y?=', '_'), # character is printable
41+
('=?UTF-8?Q?=c3=a4?=', 'ä'), # character is printable
3942
(None, None),
4043
]
4144

0 commit comments

Comments
 (0)