Merge pull request #18 from wahlflo/17-eml_analyzer-text-throws-cyrillic-characters-instead-of-german-umlauts

wahlflo · web-flow · commit 20c387eb3592 · 2023-08-04T20:49:50.000+02:00
Fixing Issues with Encodings
diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
@@ -8,24 +8,28 @@ on:
 
 jobs:
   build:
-
-    runs-on: ubuntu-latest
+    name: Test on ${{ matrix.os }}
     strategy:
       fail-fast: false
       matrix:
+        os: [ ubuntu-latest, windows-latest ]
         python-version: [ "3.7", "3.8", "3.9", "3.10" , "3.11" ]
 
+    runs-on: ${{ matrix.os }}
+
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v3
       with:
         python-version: ${{ matrix.python-version }}
+
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         python -m pip install pytest
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        python -m pip install -r requirements.txt
+
     - name: Test with pytest
       run: |
         pytest
diff --git a/eml_analyzer/cli_script.py b/eml_analyzer/cli_script.py
@@ -10,7 +10,7 @@
 
 def main():
     argument_parser = argparse.ArgumentParser(prog='emlAnalyzer', description='A CLI script to analyze an email in the EML format for viewing headers, extracting attachments, etc.')
-    argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r'), nargs='?', default=sys.stdin)
+    argument_parser.add_argument('-i', '--input', help="Path to the EML file. Accepts standard input if omitted", type=argparse.FileType('r', encoding='utf-8'), nargs='?', default=sys.stdin)
     argument_parser.add_argument('--header', action='store_true', default=False, help="Shows the headers")
     argument_parser.add_argument('-x', '--tracking', action='store_true', default=False, help="Shows content which is reloaded from external resources in the HTML part")
     argument_parser.add_argument('-a', '--attachments', action='store_true', default=False, help="Lists attachments")
diff --git a/eml_analyzer/library/parser/parsed_email.py b/eml_analyzer/library/parser/parsed_email.py
@@ -7,6 +7,7 @@
 from typing import NamedTuple, List, Tuple, Set
 
 from eml_analyzer.library.parser.attachment import Attachment
+from eml_analyzer.library.parser.printable_filename import decode_ASCII_encoded_string
 from eml_analyzer.library.parser.structure_item import StructureItem
 
 
@@ -48,7 +49,7 @@ def _add_error_messages(self, error_message: str) -> None:
 
     def get_header(self) -> List[Tuple[str, any]]:
         """ returns list of key-value pairs of header entries """
-        return self._parsed_email.items()
+        return [(key, decode_ASCII_encoded_string(value)) for key, value in self._parsed_email.items()]
 
     def get_structure(self) -> StructureItem:
         return StructureItem(message=self._parsed_email)
@@ -86,14 +87,19 @@ def _get_first_email_payload_with_matching_type(message: email.message.Message,
 
     @staticmethod
     def _get_decoded_payload_from_message(message: email.message.Message) -> None or str:
+        transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
+        if transfer_encoding in {'7bit', '8bit', 'binary'}:
+            return message.get_payload(decode=False)
+
         payload_in_bytes = message.get_payload(decode=True)
 
         list_of_possible_encodings = ParsedEmail._create_list_of_possible_encodings(message=message)
 
         for encoding_format in list_of_possible_encodings:
             try:
                 return payload_in_bytes.decode(encoding_format)
-            except ValueError:
+            except ValueError as error:
+                print('Error: ' + str(error))
                 continue
         raise PayloadDecodingException('Payload could not be decoded')
 
@@ -102,23 +108,44 @@ def _create_list_of_possible_encodings(message: email.message.Message) -> list:
         """ creates a list of the most possible encodings of a payload """
         list_of_possible_encodings = list()
 
+        header_values = ParsedEmail._header_lookup(message=message, key='content-type')
+
         # at first add the encodings mentioned in the object header
-        for k, v in message.items():
-            k = str(k).lower()
-            v = str(v).lower()
-            if k == 'content-type':
-                entries = v.split(';')
-                for entry in entries:
-                    entry = entry.strip()
-                    if entry.startswith('charset='):
-                        encoding = entry.replace('charset=', '').replace('"', '')
-                        list_of_possible_encodings.append(encoding)
+        for v in header_values:
+            entries = v.split(';')
+            for entry in entries:
+                entry = entry.strip()
+                if entry.startswith('charset='):
+                    encoding = entry.replace('charset=', '').replace('"', '')
+                    list_of_possible_encodings.append(encoding)
 
         for x in ['utf-8', 'windows-1251', 'iso-8859-1', 'us-ascii', 'iso-8859-15']:
             if x not in list_of_possible_encodings:
                 list_of_possible_encodings.append(x)
         return list_of_possible_encodings
 
+    @staticmethod
+    def _payload_needs_decoding(message: email.message.Message) -> bool:
+        transfer_encoding = ParsedEmail._header_lookup_first_element(message=message, key='content-transfer-encoding')
+        if transfer_encoding is None:
+            return True
+        return transfer_encoding not in {'7bit', '8bit', 'binary'}
+
+    @staticmethod
+    def _header_lookup_first_element(message: email.message.Message, key: str) -> str or None:
+        for header_key, value in message.items():
+            if str(header_key).lower() == key:
+                return str(value).lower()
+        return None
+
+    @staticmethod
+    def _header_lookup(message: email.message.Message, key: str) -> [str]:
+        values = list()
+        for header_key, value in message.items():
+            if str(header_key).lower() == key:
+                values.append(str(value).lower())
+        return values
+
     def get_attachments(self) -> List[Attachment]:
         return_list = list()
         counter = 0
diff --git a/eml_analyzer/library/parser/printable_filename.py b/eml_analyzer/library/parser/printable_filename.py
@@ -2,6 +2,7 @@
 import binascii
 import re
 import email.message
+import quopri
 
 
 def get_printable_filename_if_existent(message: email.message.Message) -> str or None:
@@ -12,7 +13,7 @@ def get_printable_filename_if_existent(message: email.message.Message) -> str or
 
 
 def _make_string_printable(original_string: str) -> str:
-    original_string = _decode_ASCII_encoded_string(string=original_string)
+    original_string = decode_ASCII_encoded_string(string=original_string)
 
     additional_allowed_chars = {'_', '.', '(', ')', '-', ' '}
     clean_name = ''
@@ -24,7 +25,13 @@ def _make_string_printable(original_string: str) -> str:
     return clean_name
 
 
-def _decode_ASCII_encoded_string(string: str) -> str:
+def decode_ASCII_encoded_string(string: str) -> str:
+    string = _decode_ASCII_encoded_string_baseX(string=string)
+    string = _decode_ASCII_encoded_string_quoted_printable_string(string=string)
+    return string
+
+
+def _decode_ASCII_encoded_string_baseX(string: str) -> str:
     """ decodes ASCII strings which are encoded like: name := "=?UTF-8?B?" + base64_encode(string) + "?=" """
     pattern = re.compile(r'=\?(.+?)\?B\?(.+?)\?=', re.IGNORECASE)
     for match in list(re.finditer(pattern=pattern, string=string)):
@@ -33,3 +40,17 @@ def _decode_ASCII_encoded_string(string: str) -> str:
         except binascii.Error:
             pass
     return string
+
+
+def _decode_ASCII_encoded_string_quoted_printable_string(string: str) -> str:
+    pattern = re.compile(r'=\?(.+?)\?Q\?(.+?)\?=', re.IGNORECASE)
+    for match in list(re.finditer(pattern=pattern, string=string)):
+        try:
+            encoding = match.group(1)
+            encoded_string = match.group(2)
+            decoded_string = quopri.decodestring(encoded_string)
+            replacement = decoded_string.decode(encoding)
+            string = string.replace(match.group(0), replacement)
+        except binascii.Error:
+            pass
+    return string
diff --git a/setup.py b/setup.py
@@ -8,7 +8,7 @@
 
 setuptools.setup(
     name="eml-analyzer",
-    version="2.0.3",
+    version="3.0.0",
     author="Florian Wahl",
     author_email="florian.wahl.developer@gmail.com",
     description="A cli script to analyze an E-Mail in the eml format for viewing the header, extracting attachments, etc.",
diff --git a/tests/library/parser/test_emails/utf8_with_umlauts.eml b/tests/library/parser/test_emails/utf8_with_umlauts.eml
@@ -0,0 +1,13 @@
+Message-ID: <c18a84e6-cc7e-e22e-63ce-49c76547617f@ra-maier.com>
+Date: Tue, 25 Jul 2023 16:07:11 +0200
+MIME-Version: 1.0
+User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:102.0) Gecko/20100101
+ Thunderbird/102.13.0
+Content-Language: en-US
+To: dmaier@mailbox.org
+From: dmaier@mailbox.org
+Subject: =?UTF-8?Q?Dies_ist_ein_d=c3=a4mlicher_Test?=
+Content-Type: text/plain; charset=UTF-8; format=flowed
+Content-Transfer-Encoding: 8bit
+
+Dies ist ein dämlicher Test.
diff --git a/tests/library/parser/test_parsed_email.py b/tests/library/parser/test_parsed_email.py
@@ -8,7 +8,7 @@ def load_test_eml_file(test_file) -> str:
     current_directory_of_the_script = os.path.dirname(__file__)
     test_emails = os.path.join(current_directory_of_the_script, 'test_emails')
     path_to_test_file = os.path.join(test_emails, test_file)
-    with open(path_to_test_file, mode='r') as input_file:
+    with open(path_to_test_file, mode='r', encoding='utf-8') as input_file:
         return input_file.read()
 
 
@@ -32,7 +32,7 @@ def test_case_1_header_subject(self):
         header = x.get_header()
         for key, value in header:
             if key == 'Subject':
-                self.assertIn(value, 'UnitTest Subject =?UTF-8?B?TcO8bmNoZW4s?=')
+                self.assertEqual(value, 'UnitTest Subject München,')
                 return
         self.fail(msg="header subject not found")
 
@@ -228,4 +228,19 @@ def test_get_reloaded_content_from_html_case_3(self):
 
     def url_decode(self):
         import urllib.parse
-        self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))
+        self.assertEqual(r"data=05%7C01", urllib.parse.unquote(r"data=05|01"))
+
+    def test_case_uf8_with_umlauts_txt(self):
+        eml_content = load_test_eml_file('utf8_with_umlauts.eml')
+        x = ParsedEmail(eml_content=eml_content)
+        self.assertEqual(x.get_text_content().replace('\n', ' ').strip(), 'Dies ist ein dämlicher Test.')
+
+    def test_case_uf8_with_umlauts_header(self):
+        eml_content = load_test_eml_file('utf8_with_umlauts.eml')
+        x = ParsedEmail(eml_content=eml_content)
+        header = x.get_header()
+        for key, value in header:
+            if key == 'Subject':
+                self.assertEqual(value, 'Dies_ist_ein_dämlicher_Test')
+                return
+        self.fail(msg="header subject not found")
diff --git a/tests/library/parser/test_printable_filename.py b/tests/library/parser/test_printable_filename.py
@@ -1,6 +1,6 @@
 import unittest
 
-from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, _decode_ASCII_encoded_string
+from eml_analyzer.library.parser.printable_filename import get_printable_filename_if_existent, _make_string_printable, decode_ASCII_encoded_string
 
 
 class TestPrintableFilename(unittest.TestCase):
@@ -12,9 +12,10 @@ def test_decode_ASCII_encoded_string(self):
             ('=?UTF-8?B?4o6Y7Z+/?=', '⎘퟿'),
             ('=?utf-8?b?4o6Y7Z+/?=', '⎘퟿'),
             ('=?utf-16?b?SABlAGwAbABvAFcAbwByAGwAZAA=?=', 'HelloWorld'),
+            ('=?UTF-8?Q?=c3=a4?=', 'ä'),
         ]
         for value, expected in test_cases:
-            result = _decode_ASCII_encoded_string(string=value)
+            result = decode_ASCII_encoded_string(string=value)
             self.assertEqual(result, expected)
 
     def test_make_string_printable(self):
@@ -24,6 +25,7 @@ def test_make_string_printable(self):
             ('Hello World', 'Hello World'),
             ('=?UTF-8?B?7Z+/?=', ''),  # character is not printable
             ('=?UTF-8?B?4o6Y?=', '_'),  # character is printable
+            ('=?UTF-8?Q?=c3=a4?=', 'ä'),  # character is printable
         ]
         for value, expected in test_cases:
             result = _make_string_printable(original_string=value)
@@ -36,6 +38,7 @@ def test_get_printable_filename_if_existent(self):
             ('Hello World', 'Hello World'),
             ('=?UTF-8?B?7Z+/?=', ''),  # character is not printable
             ('=?UTF-8?B?4o6Y?=', '_'),  # character is printable
+            ('=?UTF-8?Q?=c3=a4?=', 'ä'),  # character is printable
             (None, None),
         ]