Add documentation of ucs2

Signed-off-by: FedericoBruzzone <[email protected]>
FedericoBruzzone · Dec 18, 2023 · 144e178 · 144e178
1 parent 02a73b0
commit 144e178
Show file tree

Hide file tree

Showing 3 changed files with 168 additions and 38 deletions.
diff --git a/src/bin/main.rs b/src/bin/main.rs
@@ -20,20 +20,20 @@ fn main() {
     // let dec = utf8::utf8_decode(&enc_utf8);
     // println!("UNICODE code point: {:x?}", dec);
 
-    let v: Vec<u32> = vec![0x10001]; // Array of code points in unicode
-    let utf8_vec: Vec<u8> = utf8::encode_in_utf8(&v);
-    // println!("v encoded utf-8: {:x?}", utf8_vec);
-    let enc_ucs2 = ucs2::ucs2_encode(&v);
-    // println!("v encoded ucs-2: {:x?}", enc_ucs2);
-    let unicode_vec_from_ucs2 = ucs2::ucs2_decode(&enc_ucs2);
-    // println!("UNICODE code point from ucs-2: {:x?}", unicode_vec_from_ucs2);
-    let unicode_vec_from_utf8 = utf8::decode_from_utf8(&utf8_vec);
-    println!(
-        "UNICODE code point from utf-8: {:x?}",
-        unicode_vec_from_utf8
-    );
-    utf8::print_utf8(&utf8_vec);
-    utf8::print_utf8_b(&utf8_vec);
+    // let v: Vec<u32> = vec![0x10001]; // Array of code points in unicode
+    // let utf8_vec: Vec<u8> = utf8::encode_in_utf8(&v);
+    // // println!("v encoded utf-8: {:x?}", utf8_vec);
+    // let enc_ucs2 = ucs2::encode_in_ucs2(&v);
+    // // println!("v encoded ucs-2: {:x?}", enc_ucs2);
+    // let unicode_vec_from_ucs2 = ucs2::decode_from_ucs2(&enc_ucs2);
+    // // println!("UNICODE code point from ucs-2: {:x?}", unicode_vec_from_ucs2);
+    // let unicode_vec_from_utf8 = utf8::decode_from_utf8(&utf8_vec);
+    // println!(
+    //     "UNICODE code point from utf-8: {:x?}",
+    //     unicode_vec_from_utf8
+    // );
+    // utf8::print_utf8(&utf8_vec);
+    // utf8::print_utf8_b(&utf8_vec);
 
     println!("UTF8 ------------------------------------");
 
@@ -69,6 +69,14 @@ fn main() {
 
     unicode::print_unicode_b(&unicode_vec_from_utf16);
 
+    println!("UCS2 ------------------------------------");
+    let v2 = vec![0xFFEE];
+    // let v = vec![0xD800, 0xDC00];
+    let ucs2_vec = ucs2::encode_in_ucs2(&v2);
+    let unicode_vec_from_utf16 = ucs2::decode_from_ucs2(&ucs2_vec);
+    utf16::print_utf16_b(&ucs2_vec);
+    unicode::print_unicode_b(&unicode_vec_from_utf16);
+
     // let v2: Vec<u32> = vec![0x10001, 0x10002]; // Array of code point in unicode
     // let enc2 = utf8::encode_in_utf8(&v2);
     // utf8::print_utf8(&enc2);

diff --git a/src/ucs2/ucs2.rs b/src/ucs2/ucs2.rs
@@ -1,47 +1,169 @@
 /*!
 UCS-2 encoding and decoding.
 
-UCS-2 (Universal Character Set 2) is a character encoding that represents each
-character in the Unicode character set with a fixed-size 16-bit code unit.
-It's important to note that UCS-2 is limited to the Basic Multilingual Plane
-(BMP) of Unicode, which includes character codes from U+0000 to U+FFFF.
-Characters outside this range require more than 16 bits and are represented
-using UTF-16 instead.
+# Encoding
+A Unicode code point is always represented using exactly [two bytes](#two-bytes) in UCS-2; the size is fixed.
 
+# Decoding
+A UCS-2 code point is decoded into a Unicode code point using its [two bytes](#two-bytes).
+
+## Representation
+
+**Note**:
+
+* UCS-2 is a subset of UTF-16.
+* UCS-2 is capable of encoding 65,536 code points. This is the same as the first 65,536 code points of UTF-16.
+
+### Two bytes
+
+**Encoding**: If the Unicode code point is less than or equal to 0xFFFF, it is represented in UCS-2 using only its 16 least significant bits. Larger code points cannot be encoded.
+
+**Decoding**: If the UCS-2 code point is less than 0xD800, or greater than 0xDBFF and at most 0xFFFF, the Unicode code point is that same 16-bit value. Values in the range 0xD800 to 0xDBFF are rejected.
+
+* Unicode code point: `nnnnnnnn|nnnnnnnn|xxxxxxxx|xxxxxxxx`
+* UCS-2 code point: `xxxxxxxx|xxxxxxxx`
 */
 
-pub fn ucs2_encode<T: AsRef<Vec<u32>>>(v: T) -> Vec<u16> {
+/// Pretty print the UCS-2 code points in hexadecimal, (binary) and decimal.
+///
+/// # Parameters
+/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
+/// * `binary_flag`: [`bool`] - A flag to print the binary representation of the UCS-2 code points.
+///
+/// # Note
+/// The bytes printed in hexadecimal are code points in UCS-2.
+fn print_ucs2_vec<T: AsRef<Vec<u16>>>(ucs2_cp: T, binary_flag: bool) {
+    let v: Vec<u16> = ucs2_cp.as_ref().to_vec();
+    let string_repr: String = String::from_utf16(&v).unwrap();
+    let binary_repr: Vec<String> = v.iter().map(|x| format!("{:08b}", x)).collect();
+    println!();
+    println!(
+        "--------------- UCS-2 of \"{}\" ---------------",
+        string_repr
+    );
+    println!("Hex: {:x?}", v);
+    if binary_flag {
+        println!("Bin: {:?}", binary_repr);
+    }
+    println!("Dec: {:?}", v);
+    println!(
+        "{}{}",
+        "-".repeat(44),
+        "-".repeat(string_repr.chars().count())
+    );
+    println!();
+}
+
+// ============================================================================
+// ================================ Public API ================================
+// ============================================================================
+/// Pretty print the UCS-2 encoding in hexadecimal and decimal of a vector of UCS-2 code points.
+///
+/// # Parameters
+/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
+///
+/// # Note
+/// The bytes printed in hexadecimal are code points in UCS-2.
+///
+/// # Example
+/// ```rust
+/// use encdec::prelude::*;
+/// let v: Vec<u16> = vec![0xFFEE];
+/// ucs2::print_ucs2(&v);
+/// ```
+/// **Output**
+/// ```text
+/// --------------- UCS-2 of "￮" ---------------
+/// Hex: [ffee]
+/// Dec: [65518]
+/// ---------------------------------------------
+pub fn print_ucs2<T: AsRef<Vec<u16>>>(utf2_cp: T) {
+    print_ucs2_vec(utf2_cp, false);
+}
+
+/// Pretty print the UCS-2 encoding in hexadecimal, binary and decimal of a vector of UCS-2 code points.
+///
+/// # Parameters
+/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
+///
+/// # Note
+/// The bytes printed in hexadecimal are code points in UCS-2.
+///
+/// # Example
+/// ```rust
+/// use encdec::prelude::*;
+/// let v: Vec<u16> = vec![0xFFEE];
+/// ucs2::print_ucs2_b(&v);
+/// ```
+/// **Output**
+/// ```text
+/// --------------- UCS-2 of "￮" ---------------
+/// Hex: [ffee]
+/// Bin: ["1111111111101110"]
+/// Dec: [65518]
+/// ---------------------------------------------
+pub fn print_ucs2_b<T: AsRef<Vec<u16>>>(ucs2_cp: T) {
+    print_ucs2_vec(ucs2_cp, true);
+}
+/// Encode a vector of unicode code points into a vector of UCS-2 code points.
+///
+/// # Parameters
+/// * `unicode_cp`: [`Vec<u32>`] - A vector of unicode code points.
+///
+/// # Returns
+/// A [`Vec<u16>`] containing the UCS-2 code points.
+///
+/// # Panics
+/// * If the input vector (`unicode_cp`) contains a code point greater than `0xFFFF`, which cannot be represented in UCS-2.
+///
+/// # Example
+/// ```rust
+/// use encdec::prelude::*;
+/// let v: Vec<u32> = vec![0xFFEE]; // Array of code points in unicode
+/// let enc: Vec<u16> = ucs2::encode_in_ucs2(&v);
+/// assert_eq!(enc, vec![0xFFEE]);
+/// ```
+pub fn encode_in_ucs2<T: AsRef<Vec<u32>>>(unicode_cp: T) -> Vec<u16> {
     let mut new_v: Vec<u16> = Vec::new();
-    let v: Vec<u32> = v.as_ref().to_vec();
+    let v: Vec<u32> = unicode_cp.as_ref().to_vec();
     for i in 0..v.len() {
-        let mut code_point = v[i];
+        let code_point = v[i];
         if code_point > 0xFFFF {
-            let extra = code_point - 0x10000;
-            new_v.push((((extra >> 10) & 0x3FF) + 0xD800) as u16);
-            code_point = 0xDC00 | code_point & 0x3FF;
+            panic!("Invalid UCS-2 sequence");
         }
         new_v.push(code_point as u16);
     }
     new_v
 }
 
-pub fn ucs2_decode<T: AsRef<Vec<u16>>>(v: T) -> Vec<u32> {
+/// Decode a vector of UCS-2 code points into a vector of unicode code points.
+///
+/// # Parameters
+/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
+///
+/// # Returns
+/// A [`Vec<u32>`] containing the unicode code points.
+///
+/// # Panics
+/// * If the input vector (`ucs2_cp`) contains a value in the range `0xD800` to `0xDBFF` (a UTF-16 high surrogate), which is not a valid UCS-2 code point.
+///
+/// # Example
+/// ```rust
+/// use encdec::prelude::*;
+/// let v: Vec<u16> = vec![0xFFEE]; // Array of code points in UCS-2
+/// let dec: Vec<u32> = ucs2::decode_from_ucs2(&v);
+/// assert_eq!(dec, vec![0xFFEE]);
+/// ```
+pub fn decode_from_ucs2<T: AsRef<Vec<u16>>>(ucs2_cp: T) -> Vec<u32> {
     let mut new_v: Vec<u32> = Vec::new();
-    let v: Vec<u16> = v.as_ref().to_vec();
+    let v: Vec<u16> = ucs2_cp.as_ref().to_vec();
     let mut i = 0;
     while i < v.len() {
         let code_point = v[i];
         if code_point >= 0xD800 && code_point <= 0xDBFF {
-            let extra = v[i + 1];
-            if (extra & 0xFC00) == 0xDC00 {
-                new_v.push(((((code_point & 0x3FF) << 10) + (extra & 0x3FF)) as u32) + 0x10000);
-                i += 1;
-            } else {
-                panic!("Invalid UCS-2 sequence");
-            }
-        } else {
-            new_v.push(code_point as u32);
+            panic!("Invalid UCS-2 sequence");
         }
+        new_v.push(code_point as u32);
         i += 1;
     }
     new_v

diff --git a/src/utf8/utf8.rs b/src/utf8/utf8.rs
@@ -9,7 +9,7 @@ A unicode code point is represented using one to four bytes in UTF-8, depending
 * If the unicode code point is in the range `0x10000` to `0x10FFFF`, it is represented using [four bytes](#four-bytes).
 
 # Decoding
-A UTF-8 code point is decoded into a unicode code point using the following rules.
+A UTF-8 code point is decoded into a unicode code point using the following rules:
 * If the first bit of the UTF-8 code point is 0, the unicode code point is represented using [one byte](#one-byte).
 * If the first three bits of the UTF-8 code point are 110, the unicode code point is represented using [two bytes](#two-bytes).
 * If the first four bits of the UTF-8 code point are 1110, the unicode code point is represented using [three bytes](#three-bytes).