From 82c1a8ee0d3529147ce62e180b68cf2aeae9853a Mon Sep 17 00:00:00 2001 From: Eric McDonald <221418+emcd@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:21:37 -0700 Subject: [PATCH 1/2] Reuse detextive line separator --- sources/mimeogram/acquirers.py | 147 +++--------------- sources/mimeogram/formatters.py | 2 +- sources/mimeogram/parsers.py | 8 +- sources/mimeogram/parts.py | 42 +---- sources/mimeogram/updaters.py | 2 +- .../test_000_mimeogram/test_500_acquirers.py | 99 ------------ 6 files changed, 33 insertions(+), 267 deletions(-) diff --git a/sources/mimeogram/acquirers.py b/sources/mimeogram/acquirers.py index a4fcebb..471858a 100644 --- a/sources/mimeogram/acquirers.py +++ b/sources/mimeogram/acquirers.py @@ -78,12 +78,18 @@ async def _acquire_from_file( location: __.Path ) -> _parts.Part: async with _aiofiles.open( location, 'rb' ) as f: # pyright: ignore content_bytes = await f.read( ) except Exception as exc: raise ContentAcquireFailure( location ) from exc - mimetype, charset = _detect_mimetype_and_charset( content_bytes, location ) + mimetype, charset = __.detextive.detect_mimetype_and_charset( + content_bytes, location + ) + if not __.detextive.is_textual_mimetype( mimetype ): + raise _exceptions.TextualMimetypeInvalidity( + str( location ), mimetype + ) if charset is None: raise ContentDecodeFailure( location, '???' ) - linesep = _parts.LineSeparators.detect_bytes( content_bytes ) + linesep = __.detextive.LineSeparators.detect_bytes( content_bytes ) if linesep is None: _scribe.warning( f"No line separator detected in '{location}'." ) - linesep = _parts.LineSeparators( __.os.linesep ) + linesep = __.detextive.LineSeparators( __.os.linesep ) try: content = content_bytes.decode( charset ) except Exception as exc: raise ContentDecodeFailure( location, charset ) from exc @@ -109,16 +115,25 @@ async def _acquire_via_http( response.headers.get( 'content-type', 'application/octet-stream' ) .split( ';' )[ 0 ].strip( ) ) content_bytes = response.content - charset = response.encoding or _detect_charset( content_bytes ) + charset = response.encoding or __.detextive.detect_charset( + content_bytes + ) if charset is None: raise ContentDecodeFailure( url, '???' ) - if not _is_textual_mimetype( mimetype ): - mimetype, _ = ( - _detect_mimetype_and_charset( - content_bytes, url, charset = charset ) ) - linesep = _parts.LineSeparators.detect_bytes( content_bytes ) + if not __.detextive.is_textual_mimetype( mimetype ): + try: + mimetype, _ = __.detextive.detect_mimetype_and_charset( + content_bytes, url, charset = charset + ) + except __.detextive.exceptions.TextualMimetypeInvalidity as exc: + raise _exceptions.TextualMimetypeInvalidity( + url, mimetype + ) from exc + if not __.detextive.is_textual_mimetype( mimetype ): + raise _exceptions.TextualMimetypeInvalidity( url, mimetype ) + linesep = __.detextive.LineSeparators.detect_bytes( content_bytes ) if linesep is None: _scribe.warning( f"No line separator detected in '{url}'." ) - linesep = _parts.LineSeparators( __.os.linesep ) + linesep = __.detextive.LineSeparators( __.os.linesep ) try: content = content_bytes.decode( charset ) except Exception as exc: raise ContentDecodeFailure( url, charset ) from exc @@ -157,102 +172,6 @@ def _collect_directory_files( return paths -def _detect_charset( content: bytes ) -> str | None: - from chardet import detect - charset = detect( content )[ 'encoding' ] - if charset is None: return charset - if charset.startswith( 'utf' ): return charset - match charset: - case 'ascii': return 'utf-8' # Assume superset. - case _: pass - # Shake out false positives, like 'MacRoman'. - try: content.decode( 'utf-8' ) - except UnicodeDecodeError: return charset - return 'utf-8' - - -def _detect_mimetype( content: bytes, location: str | __.Path ) -> str | None: - from mimetypes import guess_type - from puremagic import PureError, from_string # pyright: ignore - try: return from_string( content, mime = True ) - except ( PureError, ValueError ): - return guess_type( str( location ) )[ 0 ] - - -def _detect_mimetype_and_charset( - content: bytes, - location: str | __.Path, *, - mimetype: __.Absential[ str ] = __.absent, - charset: __.Absential[ str ] = __.absent, -) -> tuple[ str, str | None ]: - from .exceptions import TextualMimetypeInvalidity - if __.is_absent( mimetype ): - mimetype_ = _detect_mimetype( content, location ) - else: mimetype_ = mimetype - if __.is_absent( charset ): # noqa: SIM108 - charset_ = _detect_charset( content ) - else: charset_ = charset - if not mimetype_: - if charset_: - mimetype_ = 'text/plain' - _validate_mimetype_with_trial_decode( - content, location, mimetype_, charset_ ) - return mimetype_, charset_ - mimetype_ = 'application/octet-stream' - if _is_textual_mimetype( mimetype_ ): - return mimetype_, charset_ - if charset_ is None: - raise TextualMimetypeInvalidity( location, mimetype_ ) - _validate_mimetype_with_trial_decode( - content, location, mimetype_, charset_ ) - return mimetype_, charset_ - - -def _is_reasonable_text_content( content: str ) -> bool: - ''' Checks if decoded content appears to be meaningful text. ''' - if not content: return False - # Check for excessive repetition of single characters (likely binary) - if len( set( content ) ) == 1: return False - # Check for excessive control characters (excluding common whitespace) - common_whitespace = '\t\n\r' - ascii_control_limit = 32 - control_chars = sum( - 1 for c in content - if ord( c ) < ascii_control_limit and c not in common_whitespace ) - if control_chars > len( content ) * 0.1: return False # >10% control chars - # Check for reasonable printable character ratio - printable_chars = sum( - 1 for c in content if c.isprintable( ) or c in common_whitespace ) - return printable_chars >= len( content ) * 0.8 # >=80% printable - - -# MIME types that are considered textual beyond those starting with 'text/'. -_TEXTUAL_MIME_TYPES = frozenset( ( - 'application/json', - 'application/xml', - 'application/xhtml+xml', - 'application/x-perl', - 'application/x-python', - 'application/x-php', - 'application/x-ruby', - 'application/x-shell', - 'application/javascript', - 'image/svg+xml', -) ) -# MIME type suffixes that indicate textual content. -_TEXTUAL_SUFFIXES = ( '+xml', '+json', '+yaml', '+toml' ) -def _is_textual_mimetype( mimetype: str ) -> bool: - ''' Checks if MIME type represents textual content. ''' - _scribe.debug( f"MIME type: {mimetype}" ) - if mimetype.startswith( ( 'text/', 'text/x-' ) ): return True - if mimetype in _TEXTUAL_MIME_TYPES: return True - if mimetype.endswith( _TEXTUAL_SUFFIXES ): - _scribe.debug( - f"MIME type '{mimetype}' accepted due to textual suffix." ) - return True - return False - - def _produce_fs_tasks( location: str | __.Path, recursive: bool = False ) -> tuple[ __.cabc.Coroutine[ None, None, _parts.Part ], ...]: @@ -277,19 +196,3 @@ async def _execute_session( ) -> _parts.Part: ) as client: return await _acquire_via_http( client, url ) return _execute_session( ) - - -def _validate_mimetype_with_trial_decode( - content: bytes, location: str | __.Path, mimetype: str, charset: str -) -> None: - ''' Validates charset fallback and returns appropriate MIME type. ''' - from .exceptions import TextualMimetypeInvalidity - try: text = content.decode( charset ) - except ( UnicodeDecodeError, LookupError ) as exc: - raise TextualMimetypeInvalidity( location, mimetype ) from exc - if _is_reasonable_text_content( text ): - _scribe.debug( - f"MIME type '{mimetype}' accepted after successful " - f"decode test with charset '{charset}' for '{location}'." ) - return - raise TextualMimetypeInvalidity( location, mimetype ) diff --git a/sources/mimeogram/formatters.py b/sources/mimeogram/formatters.py index 585299e..b750462 100644 --- a/sources/mimeogram/formatters.py +++ b/sources/mimeogram/formatters.py @@ -45,7 +45,7 @@ def format_mimeogram( location = 'mimeogram://message', mimetype = 'text/plain', # TODO? Markdown charset = 'utf-8', - linesep = _parts.LineSeparators.LF, + linesep = __.detextive.LineSeparators.LF, content = message ) lines.append( format_part( message_part, boundary ) ) for part in parts: diff --git a/sources/mimeogram/parsers.py b/sources/mimeogram/parsers.py index 5d0ccf4..0863595 100644 --- a/sources/mimeogram/parsers.py +++ b/sources/mimeogram/parsers.py @@ -109,17 +109,19 @@ def _parse_descriptor_and_content( _QUOTES = '"\'' -def _parse_mimetype( header: str ) -> tuple[ str, str, _parts.LineSeparators ]: +def _parse_mimetype( header: str ) -> tuple[ + str, str, __.detextive.LineSeparators +]: ''' Extracts MIME type and charset from Content-Type header. ''' parts = [ p.strip( ) for p in header.split( ';' ) ] mimetype = parts[ 0 ] charset = 'utf-8' - linesep = _parts.LineSeparators.LF + linesep = __.detextive.LineSeparators.LF for part in parts[ 1: ]: if part.startswith( 'charset=' ): charset = part[ 8: ].strip( _QUOTES ) if part.startswith( 'linesep=' ): - linesep = _parts.LineSeparators[ + linesep = __.detextive.LineSeparators[ part[ 8: ].strip( _QUOTES ).upper( ) ] return mimetype, charset, linesep diff --git a/sources/mimeogram/parts.py b/sources/mimeogram/parts.py index e460f59..cd131d2 100644 --- a/sources/mimeogram/parts.py +++ b/sources/mimeogram/parts.py @@ -24,47 +24,7 @@ from . import __ from . import fsprotect as _fsprotect - -class LineSeparators( __.enum.Enum ): - ''' Line separators for various platforms. ''' - - CR = '\r' # Classic MacOS - CRLF = '\r\n' # DOS/Windows - LF = '\n' # Unix/Linux - - @classmethod - def detect_bytes( - selfclass, content: bytes, limit = 1024 - ) -> "LineSeparators | None": - ''' Detects newline characters in bytes array. ''' - sample = content[ : limit ] - found_cr = False - for byte in sample: - match byte: - case 0xd: - if found_cr: return selfclass.CR - found_cr = True - case 0xa: # linefeed - if found_cr: return selfclass.CRLF - return selfclass.LF - case _: - if found_cr: return selfclass.CR - return None - - @classmethod - def normalize_universal( selfclass, content: str ) -> str: - ''' Normalizes all varieties of newline characters in text. ''' - return content.replace( '\r\n', '\r' ).replace( '\r', '\n' ) - - def nativize( self, content: str ) -> str: - ''' Nativizes specific variety newline characters in text. ''' - if LineSeparators.LF is self: return content - return content.replace( '\n', self.value ) - - def normalize( self, content: str ) -> str: - ''' Normalizes specific variety newline characters in text. ''' - if LineSeparators.LF is self: return content - return content.replace( self.value, '\n' ) +LineSeparators = __.detextive.LineSeparators class Resolutions( __.enum.Enum ): diff --git a/sources/mimeogram/updaters.py b/sources/mimeogram/updaters.py index b49e3fb..556ebaf 100644 --- a/sources/mimeogram/updaters.py +++ b/sources/mimeogram/updaters.py @@ -182,7 +182,7 @@ async def _update_content_atomic( location: __.Path, content: str, charset: str = 'utf-8', - linesep: _parts.LineSeparators = _parts.LineSeparators.LF + linesep: __.detextive.LineSeparators = __.detextive.LineSeparators.LF ) -> None: ''' Updates file content atomically, if possible. ''' import aiofiles.os as os # noqa: PLR0402 diff --git a/tests/test_000_mimeogram/test_500_acquirers.py b/tests/test_000_mimeogram/test_500_acquirers.py index a24794a..3b17985 100644 --- a/tests/test_000_mimeogram/test_500_acquirers.py +++ b/tests/test_000_mimeogram/test_500_acquirers.py @@ -147,105 +147,6 @@ async def test_200_detect_line_endings( provide_tempdir, provide_auxdata ): assert part.content.count( '\n' ) == 2 -# Character Set Tests - -@pytest.mark.asyncio -async def test_300_detect_charset( provide_tempdir, provide_auxdata ): - ''' Successfully detects different character sets. ''' - acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - - # Create and populate test files - utf8_path = provide_tempdir / "utf8bom.txt" - ascii_path = provide_tempdir / "ascii.txt" - utf16_path = provide_tempdir / "utf16.txt" - latin1_path = provide_tempdir / "latin1.txt" - - # Write binary content - utf8_path.write_bytes(b'\xef\xbb\xbfHello, World!\n') # UTF-8 with BOM - ascii_path.write_bytes(b'Hello, World!\n') # ASCII content - utf16_path.write_bytes( - b'\xff\xfeH\x00e\x00l\x00l\x00o\x00!\x00\n\x00') # UTF-16 LE - latin1_path.write_bytes(b'Caf\xe9\n') # ISO-8859-1 / invalid UTF-8 - - try: - results = await acquirers.acquire( - provide_auxdata, [utf8_path, ascii_path, utf16_path, latin1_path] ) - - charsets = { part.charset.lower() for part in results } - assert 'utf-8' in charsets - assert 'utf-16' in charsets - assert 'iso-8859-9' in charsets or 'latin1' in charsets - finally: - for path in (utf8_path, ascii_path, utf16_path, latin1_path): - if path.exists(): - path.unlink() - - -# MIME Type Tests - -@pytest.mark.asyncio -async def test_400_detect_mime_types( provide_tempdir, provide_auxdata ): - ''' Successfully detects MIME types for different file types. ''' - acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - - test_files = { - "plain.txt": "Plain text\n", - "script.py": ( - "#!/usr/bin/env python3\n" - "from __future__ import annotations\n\n" - "def hello() -> str:\n return 'Python'\n" ), - # Test pattern-based detection for structured text formats - "config.toml": "[package]\nname = 'test'\n", - "data.yaml": "key: value\nlist:\n - item1\n", - "service.json": '{"name": "test", "version": "1.0"}\n', - "manifest.xml": ( - 'test\n' ), - "rust_code.rs": 'fn main() { println!("Hello, world!"); }\n', - } - - with create_test_files( provide_tempdir, test_files ): - results = await acquirers.acquire( provide_auxdata, [ - provide_tempdir / "plain.txt", - provide_tempdir / "script.py", - provide_tempdir / "config.toml", - provide_tempdir / "data.yaml", - provide_tempdir / "service.json", - provide_tempdir / "manifest.xml", - provide_tempdir / "rust_code.rs", - ] ) - - assert len( results ) == 7 - mimetypes = { part.mimetype for part in results } - - # Existing assertions - assert "text/plain" in mimetypes - assert any( "python" in mt for mt in mimetypes ) - - # Pattern-based detection assertions for recognized MIME types - assert any( - mt.endswith( '+json' ) or 'json' in mt for mt in mimetypes ) - assert any( - mt.endswith( '+xml' ) or 'xml' in mt for mt in mimetypes ) - - # TOML and YAML files should be accepted via charset fallback - # since Python's mimetypes doesn't recognize them - toml_results = [ - p for p in results if p.location.endswith( 'config.toml' ) ] - yaml_results = [ - p for p in results if p.location.endswith( 'data.yaml' ) ] - assert len( toml_results ) == 1 - assert len( yaml_results ) == 1 - - # Rust files should be accepted (regression test for original issue) - rust_results = [ - p for p in results if p.location.endswith( 'rust_code.rs' ) ] - assert len( rust_results ) == 1 - # Platform-agnostic: accept any textual MIME type for Rust files - rust_mimetype = rust_results[ 0 ].mimetype - assert \ - ( rust_mimetype.startswith( 'text/' ) - or rust_mimetype.startswith( 'application/' ) - ), f"Rust file should have textual MIME type, got: {rust_mimetype}" @pytest.mark.asyncio From 14d6b654dda3230f68ca2946c962ac8af364c48b Mon Sep 17 00:00:00 2001 From: Eric McDonald <221418+emcd@users.noreply.github.com> Date: Thu, 14 Aug 2025 13:55:59 -0700 Subject: [PATCH 2/2] Drop local textual detection --- sources/mimeogram/acquirers.py | 17 +++--- sources/mimeogram/exceptions.py | 8 --- sources/mimeogram/parts.py | 4 +- .../test_000_mimeogram/test_100_exceptions.py | 12 ---- tests/test_000_mimeogram/test_110_parts.py | 59 +++++++++---------- tests/test_000_mimeogram/test_200_parsers.py | 15 +++-- .../test_000_mimeogram/test_210_formatters.py | 5 +- .../test_320_differences.py | 13 ++-- .../test_330_interactions.py | 17 +++--- .../test_000_mimeogram/test_500_acquirers.py | 25 ++++---- tests/test_000_mimeogram/test_510_updaters.py | 23 ++++---- tests/test_000_mimeogram/test_610_apply.py | 7 ++- 12 files changed, 94 insertions(+), 111 deletions(-) diff --git a/sources/mimeogram/acquirers.py b/sources/mimeogram/acquirers.py index 471858a..4e95ad4 100644 --- a/sources/mimeogram/acquirers.py +++ b/sources/mimeogram/acquirers.py @@ -82,7 +82,7 @@ async def _acquire_from_file( location: __.Path ) -> _parts.Part: content_bytes, location ) if not __.detextive.is_textual_mimetype( mimetype ): - raise _exceptions.TextualMimetypeInvalidity( + raise __.detextive.exceptions.TextualMimetypeInvalidity( str( location ), mimetype ) if charset is None: raise ContentDecodeFailure( location, '???' ) @@ -120,16 +120,13 @@ async def _acquire_via_http( ) if charset is None: raise ContentDecodeFailure( url, '???' ) if not __.detextive.is_textual_mimetype( mimetype ): - try: - mimetype, _ = __.detextive.detect_mimetype_and_charset( - content_bytes, url, charset = charset - ) - except __.detextive.exceptions.TextualMimetypeInvalidity as exc: - raise _exceptions.TextualMimetypeInvalidity( - url, mimetype - ) from exc + mimetype, _ = __.detextive.detect_mimetype_and_charset( + content_bytes, url, charset = charset + ) if not __.detextive.is_textual_mimetype( mimetype ): - raise _exceptions.TextualMimetypeInvalidity( url, mimetype ) + raise __.detextive.exceptions.TextualMimetypeInvalidity( + url, mimetype + ) linesep = __.detextive.LineSeparators.detect_bytes( content_bytes ) if linesep is None: _scribe.warning( f"No line separator detected in '{url}'." ) diff --git a/sources/mimeogram/exceptions.py b/sources/mimeogram/exceptions.py index 12637f7..d7628e7 100644 --- a/sources/mimeogram/exceptions.py +++ b/sources/mimeogram/exceptions.py @@ -114,14 +114,6 @@ def __init__( self, species: str ): super( ).__init__( f"Could not discover valid {species}." ) -class TextualMimetypeInvalidity( Omnierror ): - ''' Invalid textual MIME type for content at location. ''' - - def __init__( self, location: str | __.Path, mimetype: str ): - super( ).__init__( - f"Invalid MIME type '{mimetype}' for content at '{location}'." ) - - class TokenizerVariantInvalidity( Omnierror ): ''' Invalid tokenizer variant. ''' diff --git a/sources/mimeogram/parts.py b/sources/mimeogram/parts.py index cd131d2..656dd91 100644 --- a/sources/mimeogram/parts.py +++ b/sources/mimeogram/parts.py @@ -24,8 +24,6 @@ from . import __ from . import fsprotect as _fsprotect -LineSeparators = __.detextive.LineSeparators - class Resolutions( __.enum.Enum ): ''' Available resolutions for each part. ''' @@ -39,7 +37,7 @@ class Part( __.immut.DataclassObject ): location: str # TODO? 'Url' class mimetype: str charset: str - linesep: "LineSeparators" + linesep: __.detextive.LineSeparators content: str # TODO? 'format' method diff --git a/tests/test_000_mimeogram/test_100_exceptions.py b/tests/test_000_mimeogram/test_100_exceptions.py index 8ded474..c50bf21 100644 --- a/tests/test_000_mimeogram/test_100_exceptions.py +++ b/tests/test_000_mimeogram/test_100_exceptions.py @@ -121,18 +121,6 @@ def test_070_mimeogram_format_failures( ): assert reason in str( exc ) -def test_080_mimetype_failures( ): - ''' MIME type failure exceptions. ''' - exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) - - location = Path( '/test/path' ) - mimetype = 'application/octet-stream' - exc = exceptions.TextualMimetypeInvalidity( location, mimetype ) - assert isinstance( exc, exceptions.Omnierror ) - assert str( location ) in str( exc ) - assert mimetype in str( exc ) - - def test_090_program_absence( ): ''' Program absence error exceptions. ''' exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) diff --git a/tests/test_000_mimeogram/test_110_parts.py b/tests/test_000_mimeogram/test_110_parts.py index 68925b5..0493068 100644 --- a/tests/test_000_mimeogram/test_110_parts.py +++ b/tests/test_000_mimeogram/test_110_parts.py @@ -23,21 +23,20 @@ from . import PACKAGE_NAME, cache_import_module +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + def test_000_line_separators_enum( ): ''' Line separator enum values and attributes. ''' - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) - # Check enum values - assert parts.LineSeparators.CR.value == '\r' - assert parts.LineSeparators.CRLF.value == '\r\n' - assert parts.LineSeparators.LF.value == '\n' + assert LineSeparators.CR.value == '\r' + assert LineSeparators.CRLF.value == '\r\n' + assert LineSeparators.LF.value == '\n' def test_010_line_separators_detection( ): ''' Line separator detection from bytes. ''' - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) - # Test detection of different line separators cr_bytes = b'line1\rline2\rline3' crlf_bytes = b'line1\r\nline2\r\nline3' @@ -48,44 +47,42 @@ def test_010_line_separators_detection( ): no_terminator_bytes = b'line1line2line3' assert ( - parts.LineSeparators.detect_bytes( cr_bytes ) - == parts.LineSeparators.CR ) + LineSeparators.detect_bytes( cr_bytes ) + == LineSeparators.CR ) assert ( - parts.LineSeparators.detect_bytes( crlf_bytes ) - == parts.LineSeparators.CRLF ) + LineSeparators.detect_bytes( crlf_bytes ) + == LineSeparators.CRLF ) assert ( - parts.LineSeparators.detect_bytes( lf_bytes ) - == parts.LineSeparators.LF ) + LineSeparators.detect_bytes( lf_bytes ) + == LineSeparators.LF ) # With mixed bytes, it detects the first encountered line separator assert ( - parts.LineSeparators.detect_bytes( mixed_bytes ) - == parts.LineSeparators.CR ) + LineSeparators.detect_bytes( mixed_bytes ) + == LineSeparators.CR ) # Double CR case assert ( - parts.LineSeparators.detect_bytes( double_cr_bytes ) - == parts.LineSeparators.CR ) + LineSeparators.detect_bytes( double_cr_bytes ) + == LineSeparators.CR ) # Empty bytes and bytes without terminators - assert parts.LineSeparators.detect_bytes( empty_bytes ) is None - assert parts.LineSeparators.detect_bytes( no_terminator_bytes ) is None + assert LineSeparators.detect_bytes( empty_bytes ) is None + assert LineSeparators.detect_bytes( no_terminator_bytes ) is None def test_020_line_separators_normalization( ): ''' Line separator normalization methods. ''' - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) - # Test universal normalization mixed_content = "line1\rline2\r\nline3\n" - normalized = parts.LineSeparators.normalize_universal( mixed_content ) + normalized = LineSeparators.normalize_universal( mixed_content ) assert normalized == "line1\nline2\nline3\n" # Test specific separator nativization and normalization cr_content = "line1\rline2\rline3" - lf_sep = parts.LineSeparators.LF - cr_sep = parts.LineSeparators.CR - crlf_sep = parts.LineSeparators.CRLF + lf_sep = LineSeparators.LF + cr_sep = LineSeparators.CR + crlf_sep = LineSeparators.CRLF # Test LF nativization (no change) assert lf_sep.nativize( cr_content ) == cr_content @@ -112,7 +109,7 @@ def test_100_part_immutability( ): location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'test content' ) @@ -124,7 +121,7 @@ def test_100_part_immutability( ): with pytest.raises( AttributeImmutability ): part.charset = 'ascii' with pytest.raises( AttributeImmutability ): - part.linesep = parts.LineSeparators.CRLF + part.linesep = LineSeparators.CRLF with pytest.raises( AttributeImmutability ): part.content = 'new content' @@ -138,7 +135,7 @@ def test_110_part_creation( ): location = '/path/to/file.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'Sample text content' ) @@ -146,7 +143,7 @@ def test_110_part_creation( ): assert part_1.location == '/path/to/file.txt' assert part_1.mimetype == 'text/plain' assert part_1.charset == 'utf-8' - assert part_1.linesep == parts.LineSeparators.LF + assert part_1.linesep == LineSeparators.LF assert part_1.content == 'Sample text content' # Test with URL location @@ -154,11 +151,11 @@ def test_110_part_creation( ): location = 'https://example.com/data.txt', mimetype = 'text/csv', charset = 'ascii', - linesep = parts.LineSeparators.CRLF, + linesep = LineSeparators.CRLF, content = 'header,value\n1,2\n' ) assert part_2.location == 'https://example.com/data.txt' assert part_2.mimetype == 'text/csv' assert part_2.charset == 'ascii' - assert part_2.linesep == parts.LineSeparators.CRLF + assert part_2.linesep == LineSeparators.CRLF diff --git a/tests/test_000_mimeogram/test_200_parsers.py b/tests/test_000_mimeogram/test_200_parsers.py index 53f37ee..2fc3774 100644 --- a/tests/test_000_mimeogram/test_200_parsers.py +++ b/tests/test_000_mimeogram/test_200_parsers.py @@ -25,6 +25,9 @@ from . import PACKAGE_NAME, cache_import_module +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + def _create_sample_mimeogram( location = 'test.txt', @@ -61,7 +64,7 @@ def test_000_basic_parse( ): assert first_part.location == 'test.txt' assert first_part.mimetype == 'text/plain' assert first_part.charset == 'utf-8' - assert first_part.linesep == parts.LineSeparators.LF + assert first_part.linesep == LineSeparators.LF assert first_part.content == 'Sample content' @@ -95,8 +98,6 @@ def test_010_parse_multiple_parts( ): def test_020_parse_part_details( ): ''' Details of a single part. ''' parsers = cache_import_module( f"{PACKAGE_NAME}.parsers" ) - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) - # Create mimeogram with detailed headers mimeogram_text = ( "--====MIMEOGRAM_0123456789abcdef====\n" @@ -113,7 +114,7 @@ def test_020_parse_part_details( ): assert first_part.location == 'detailed.txt' assert first_part.mimetype == 'application/json' assert first_part.charset == 'utf-8' - assert first_part.linesep == parts.LineSeparators.CRLF + assert first_part.linesep == LineSeparators.CRLF assert first_part.content == '{"key": "value"}' @@ -204,12 +205,10 @@ def test_070_unicode_content( ): def test_080_line_separator_variations( ): ''' Mimeograms with different line separators. ''' parsers = cache_import_module( f"{PACKAGE_NAME}.parsers" ) - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) - # Create test cases for LF and CRLF line separators separators = [ - ('\n', parts.LineSeparators.LF), - ('\r\n', parts.LineSeparators.CRLF) + ('\n', LineSeparators.LF), + ('\r\n', LineSeparators.CRLF) ] for sep, expected_type in separators: diff --git a/tests/test_000_mimeogram/test_210_formatters.py b/tests/test_000_mimeogram/test_210_formatters.py index 6036918..bb12081 100644 --- a/tests/test_000_mimeogram/test_210_formatters.py +++ b/tests/test_000_mimeogram/test_210_formatters.py @@ -26,6 +26,9 @@ from . import PACKAGE_NAME, cache_import_module +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + def _create_sample_part( location = 'test.txt', @@ -39,7 +42,7 @@ def _create_sample_part( location = location, mimetype = mimetype, charset = charset, - linesep = parts.LineSeparators[ linesep ], + linesep = LineSeparators[ linesep ], content = content ) diff --git a/tests/test_000_mimeogram/test_320_differences.py b/tests/test_000_mimeogram/test_320_differences.py index f34aa71..d083763 100644 --- a/tests/test_000_mimeogram/test_320_differences.py +++ b/tests/test_000_mimeogram/test_320_differences.py @@ -27,6 +27,9 @@ from . import PACKAGE_NAME, cache_import_module, create_test_files +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + class MockDisplay: ''' Simple display implementation for testing. ''' @@ -72,7 +75,7 @@ async def test_100_select_segments_empty_revision( provide_tempdir ): location = str( test_path ), mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = current ) target = parts.Target( part = part, @@ -104,7 +107,7 @@ async def test_110_select_segments_with_changes( provide_tempdir ): location = str( test_path ), mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = current ) target = parts.Target( part = part, @@ -139,7 +142,7 @@ async def test_120_select_segments_reject_changes( provide_tempdir ): location = str( test_path ), mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = current ) target = parts.Target( part = part, @@ -176,7 +179,7 @@ async def test_130_select_segments_multiple_changes( location = str( test_path ), mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = current ) target = parts.Target( part = part, @@ -289,7 +292,7 @@ async def __call__( self, lines ): location = str( test_path ), mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = current ) target = parts.Target( part = part, diff --git a/tests/test_000_mimeogram/test_330_interactions.py b/tests/test_000_mimeogram/test_330_interactions.py index c170d66..f2517f9 100644 --- a/tests/test_000_mimeogram/test_330_interactions.py +++ b/tests/test_000_mimeogram/test_330_interactions.py @@ -25,6 +25,9 @@ from . import PACKAGE_NAME, cache_import_module +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + def test_100_calculate_differences( ): ''' Difference calculation handles various content cases. ''' @@ -35,7 +38,7 @@ def test_100_calculate_differences( ): location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "" ) # Empty both @@ -70,7 +73,7 @@ async def mock_selector( *args ): return "test content" location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, @@ -110,7 +113,7 @@ async def mock_selector( *args ): return "test content" location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, @@ -151,7 +154,7 @@ async def mock_selector( *args ): return "test content" location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, @@ -202,7 +205,7 @@ async def mock_differences_display( *args ): location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, @@ -249,7 +252,7 @@ async def mock_selector( *args ): location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, @@ -291,7 +294,7 @@ async def mock_selector( *args ): return "test content" location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) target = parts.Target( part = part, diff --git a/tests/test_000_mimeogram/test_500_acquirers.py b/tests/test_000_mimeogram/test_500_acquirers.py index 3b17985..182638a 100644 --- a/tests/test_000_mimeogram/test_500_acquirers.py +++ b/tests/test_000_mimeogram/test_500_acquirers.py @@ -34,6 +34,12 @@ produce_test_environment, ) +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators +DetextiveTextualMimetypeInvalidity = ( + __.detextive.exceptions.TextualMimetypeInvalidity +) + @pytest.fixture def provide_auxdata( provide_tempdir, provide_tempenv ): @@ -127,7 +133,6 @@ async def test_120_acquire_recursive_directory( async def test_200_detect_line_endings( provide_tempdir, provide_auxdata ): ''' Successfully detects and normalizes different line endings. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - parts = cache_import_module( f"{PACKAGE_NAME}.parts" ) test_files = { "unix.txt": "line1\nline2\n", # LF "windows.txt": "line1\r\nline2\r\n", # CRLF @@ -139,8 +144,7 @@ async def test_200_detect_line_endings( provide_tempdir, provide_auxdata ): ] ) assert len( results ) == 2 lineseps = { part.linesep for part in results } - assert lineseps == { - parts.LineSeparators.LF, parts.LineSeparators.CRLF } + assert lineseps == { LineSeparators.LF, LineSeparators.CRLF } # All content should be normalized to LF for part in results: assert part.content.count( '\r\n' ) == 0 @@ -207,7 +211,6 @@ async def test_500_invalid_file( provide_tempdir, provide_auxdata ): ''' Properly handles missing files. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) - nonexistent = provide_tempdir / "nonexistent.txt" with pytest.raises( exceptions.ContentAcquireFailure ) as excinfo: await acquirers.acquire( provide_auxdata, [ nonexistent ] ) @@ -220,7 +223,6 @@ async def test_510_unsupported_scheme( provide_auxdata ): ''' Properly handles unsupported URL schemes. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) - with pytest.raises( exceptions.UrlSchemeNoSupport ) as excinfo: await acquirers.acquire( provide_auxdata, [ "ftp://example.com/file.txt" ] ) @@ -234,8 +236,6 @@ async def test_520_nontextual_mime( provide_tempdir, provide_auxdata ): modes. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) - binary_path = provide_tempdir / 'binary.bin' binary_path.write_bytes( bytes( [ 0xFF, 0x00 ] * 128 ) ) @@ -249,7 +249,7 @@ async def test_520_nontextual_mime( provide_tempdir, provide_auxdata ): assert len( excinfo.value.exceptions ) == 1 assert isinstance( excinfo.value.exceptions[ 0 ], - exceptions.TextualMimetypeInvalidity ) + DetextiveTextualMimetypeInvalidity ) err_msg = str( excinfo.value.exceptions[ 0 ] ) assert str( binary_path ) in err_msg assert 'application/octet-stream' in err_msg @@ -321,7 +321,6 @@ async def test_525_charset_fallback_validation( async def test_530_strict_mode_handling( provide_tempdir, provide_auxdata ): ''' Tests strict mode handling of invalid files. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) test_files = { 'valid.txt': 'Valid text content\n', @@ -345,7 +344,7 @@ async def test_530_strict_mode_handling( provide_tempdir, provide_auxdata ): assert len( excinfo.value.exceptions ) == 1 assert isinstance( excinfo.value.exceptions[ 0 ], - exceptions.TextualMimetypeInvalidity ) + DetextiveTextualMimetypeInvalidity ) # Test non-strict mode provide_auxdata.configuration[ @@ -368,7 +367,6 @@ async def test_540_strict_mode_multiple_failures( ): ''' Tests strict mode handling of multiple invalid files. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) test_files = { 'valid.txt': 'Valid text content\n', @@ -395,7 +393,7 @@ async def test_540_strict_mode_multiple_failures( assert len( excinfo.value.exceptions ) == 2 for exc in excinfo.value.exceptions: - assert isinstance( exc, exceptions.TextualMimetypeInvalidity ) + assert isinstance( exc, DetextiveTextualMimetypeInvalidity ) # Test non-strict mode provide_auxdata.configuration[ @@ -560,7 +558,6 @@ async def test_620_http_nontextual_mimetype( provide_auxdata, httpx_mock ): and non-strict modes. ''' acquirers = cache_import_module( f"{PACKAGE_NAME}.acquirers" ) - exceptions = cache_import_module( f"{PACKAGE_NAME}.exceptions" ) test_url = 'https://example.com/test.bin' httpx_mock.add_response( @@ -579,7 +576,7 @@ async def test_620_http_nontextual_mimetype( provide_auxdata, httpx_mock ): assert len( excinfo.value.exceptions ) == 1 assert isinstance( excinfo.value.exceptions[ 0 ], - exceptions.TextualMimetypeInvalidity ) + DetextiveTextualMimetypeInvalidity ) assert test_url in str( excinfo.value.exceptions[ 0 ] ) # Reset mock for non-strict mode test diff --git a/tests/test_000_mimeogram/test_510_updaters.py b/tests/test_000_mimeogram/test_510_updaters.py index b1bc165..aa0825d 100644 --- a/tests/test_000_mimeogram/test_510_updaters.py +++ b/tests/test_000_mimeogram/test_510_updaters.py @@ -38,6 +38,9 @@ create_test_files, ) +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + @dataclass( frozen = True ) class _TestProtector: @@ -117,7 +120,7 @@ async def test_100_update_simple_file( location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'updated content' ) @@ -146,7 +149,7 @@ async def test_110_update_skips_mimeogram_protocol( location = 'mimeogram://message', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'test content' ) @@ -176,7 +179,7 @@ async def test_120_update_respects_protection( location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'updated content' ) @@ -208,7 +211,7 @@ async def test_130_update_override_protections( location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'updated content' ) @@ -244,7 +247,7 @@ async def test_140_update_respects_interactor( provide_tempdir ): location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'test content' ) @@ -293,7 +296,7 @@ async def test_160_partitive_ignore_mode( provide_tempdir ): location = 'test.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'new content' ) @@ -333,14 +336,14 @@ async def test_170_queue_and_reverter_rollback_on_error( location = 'file1.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts_mod.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'file1 updated' ) part2 = parts_mod.Part( location = 'file2.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts_mod.LineSeparators.LF, + linesep = LineSeparators.LF, content = 'file2 updated' ) @@ -384,7 +387,7 @@ async def test_180_line_endings_preserved( provide_tempdir ): location = 'test_windows.txt', mimetype = 'text/plain', charset = 'utf-8', - linesep = parts_mod.LineSeparators.CRLF, + linesep = LineSeparators.CRLF, content = 'line1\r\nline2\r\nline3\r\n' ) @@ -413,7 +416,7 @@ async def test_190_reverter_direct_coverage( provide_tempdir ): location = str( nonexistent_path ), mimetype = 'text/plain', charset = 'utf-8', - linesep = parts_mod.LineSeparators.LF, + linesep = LineSeparators.LF, content = '' ) # 1) Non-existent => skip saving diff --git a/tests/test_000_mimeogram/test_610_apply.py b/tests/test_000_mimeogram/test_610_apply.py index b970fcd..d8a3ae0 100644 --- a/tests/test_000_mimeogram/test_610_apply.py +++ b/tests/test_000_mimeogram/test_610_apply.py @@ -27,6 +27,9 @@ from . import PACKAGE_NAME, cache_import_module +__ = cache_import_module( f"{PACKAGE_NAME}.__" ) +LineSeparators = __.detextive.LineSeparators + @pytest.mark.usefixtures( ) class MockContentAcquirer: @@ -209,7 +212,7 @@ async def test_400_apply_success( ): location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) ] async def mock_updater( @@ -284,7 +287,7 @@ async def test_430_apply_update_failure( ): location = "test.txt", mimetype = "text/plain", charset = "utf-8", - linesep = parts.LineSeparators.LF, + linesep = LineSeparators.LF, content = "test content" ) ] async def failing_updater( auxdata, parts, **kwargs ):