Skip to content

Commit

Permalink
Add documentation of ucs2
Browse files Browse the repository at this point in the history
Signed-off-by: FedericoBruzzone <[email protected]>
  • Loading branch information
FedericoBruzzone committed Dec 18, 2023
1 parent 02a73b0 commit 144e178
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 38 deletions.
36 changes: 22 additions & 14 deletions src/bin/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,20 +20,20 @@ fn main() {
// let dec = utf8::utf8_decode(&enc_utf8);
// println!("UNICODE code point: {:x?}", dec);

let v: Vec<u32> = vec![0x10001]; // Array of code points in unicode
let utf8_vec: Vec<u8> = utf8::encode_in_utf8(&v);
// println!("v encoded utf-8: {:x?}", utf8_vec);
let enc_ucs2 = ucs2::ucs2_encode(&v);
// println!("v encoded ucs-2: {:x?}", enc_ucs2);
let unicode_vec_from_ucs2 = ucs2::ucs2_decode(&enc_ucs2);
// println!("UNICODE code point from ucs-2: {:x?}", unicode_vec_from_ucs2);
let unicode_vec_from_utf8 = utf8::decode_from_utf8(&utf8_vec);
println!(
"UNICODE code point from utf-8: {:x?}",
unicode_vec_from_utf8
);
utf8::print_utf8(&utf8_vec);
utf8::print_utf8_b(&utf8_vec);
// let v: Vec<u32> = vec![0x10001]; // Array of code points in unicode
// let utf8_vec: Vec<u8> = utf8::encode_in_utf8(&v);
// // println!("v encoded utf-8: {:x?}", utf8_vec);
// let enc_ucs2 = ucs2::encode_in_ucs2(&v);
// // println!("v encoded ucs-2: {:x?}", enc_ucs2);
// let unicode_vec_from_ucs2 = ucs2::decode_from_ucs2(&enc_ucs2);
// // println!("UNICODE code point from ucs-2: {:x?}", unicode_vec_from_ucs2);
// let unicode_vec_from_utf8 = utf8::decode_from_utf8(&utf8_vec);
// println!(
// "UNICODE code point from utf-8: {:x?}",
// unicode_vec_from_utf8
// );
// utf8::print_utf8(&utf8_vec);
// utf8::print_utf8_b(&utf8_vec);

println!("UTF8 ------------------------------------");

Expand Down Expand Up @@ -69,6 +69,14 @@ fn main() {

unicode::print_unicode_b(&unicode_vec_from_utf16);

println!("UCS2 ------------------------------------");
let v2 = vec![0xFFEE];
// let v = vec![0xD800, 0xDC00];
let ucs2_vec = ucs2::encode_in_ucs2(&v2);
let unicode_vec_from_utf16 = ucs2::decode_from_ucs2(&ucs2_vec);
utf16::print_utf16_b(&ucs2_vec);
unicode::print_unicode_b(&unicode_vec_from_utf16);

// let v2: Vec<u32> = vec![0x10001, 0x10002]; // Array of code point in unicode
// let enc2 = utf8::encode_in_utf8(&v2);
// utf8::print_utf8(&enc2);
Expand Down
168 changes: 145 additions & 23 deletions src/ucs2/ucs2.rs
Original file line number Diff line number Diff line change
@@ -1,47 +1,169 @@
/*!
UCS-2 encoding and decoding.
UCS-2 (Universal Character Set 2) is a character encoding that represents each
character in the Unicode character set with a fixed-size 16-bit code unit.
It's important to note that UCS-2 is limited to the Basic Multilingual Plane
(BMP) of Unicode, which includes character codes from U+0000 to U+FFFF.
Characters outside this range require more than 16 bits and are represented
using UTF-16 instead.
# Encoding
A unicode code point is always represented in UCS-2 using exactly [two bytes](#two-bytes); the size is fixed.
# Decoding
A UCS-2 code unit is decoded into a unicode code point using its [two bytes](#two-bytes).
## Representation
**Note**:
* UCS-2 is a subset of UTF-16.
* UCS-2 is capable of encoding 65,536 code points. These are the same as the first 65,536 code points of UTF-16.
### Two bytes
**Encoding**: If the unicode code point is less than or equal to 0xFFFF, it is represented in UCS-2 using only its 16 least significant bits.
**Decoding**: If the UCS-2 code unit is less than 0xD800, or greater than 0xDBFF and less than or equal to 0xFFFF, the unicode code point is given directly by its 16 bits.
* Unicode code point: `nnnnnnnn|nnnnnnnn|xxxxxxxx|xxxxxxxx`
* UCS-2 code unit: `xxxxxxxx|xxxxxxxx`
*/

pub fn ucs2_encode<T: AsRef<Vec<u32>>>(v: T) -> Vec<u16> {
/// Pretty print UCS-2 code units in hexadecimal, (optionally) binary, and decimal.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code units.
/// * `binary_flag`: [`bool`] - When `true`, also print every code unit as a
///   zero-padded, 16-digit binary string.
///
/// # Note
/// The values printed in hexadecimal are UCS-2 code units.
/// The header shows the text form of the input; code units that cannot be
/// converted to text on their own (unpaired surrogates) are shown as U+FFFD
/// instead of making the printer panic.
fn print_ucs2_vec<T: AsRef<Vec<u16>>>(ucs2_cp: T, binary_flag: bool) {
    // Borrow the caller's data; printing does not need an owned copy.
    let units: &[u16] = ucs2_cp.as_ref();
    // Lossy conversion: a pretty-printer must not abort on unpaired surrogates.
    let string_repr: String = String::from_utf16_lossy(units);

    println!();
    println!(
        "--------------- UCS-2 of \"{}\" ---------------",
        string_repr
    );
    println!("Hex: {:x?}", units);
    if binary_flag {
        // UCS-2 code units are 16 bits wide, so pad to 16 binary digits.
        let binary_repr: Vec<String> = units.iter().map(|x| format!("{:016b}", x)).collect();
        println!("Bin: {:?}", binary_repr);
    }
    println!("Dec: {:?}", units);
    // The closing rule grows with the number of characters shown in the header.
    println!("{}", "-".repeat(44 + string_repr.chars().count()));
    println!();
}

// ============================================================================
// ================================ Public API ================================
// ============================================================================
/// Pretty print a vector of UCS-2 code units in hexadecimal and decimal.
///
/// This is a thin wrapper that calls `print_ucs2_vec` with the binary
/// representation disabled.
///
/// # Parameters
/// * `utf2_cp`: [`Vec<u16>`] - A vector of UCS-2 code units.
///
/// # Note
/// The values printed in hexadecimal are UCS-2 code units.
///
/// # Example
/// ```rust
/// use encdec::prelude::*;
/// let v: Vec<u16> = vec![0xFFEE];
/// ucs2::print_ucs2(&v);
/// ```
/// **Output** (preceded and followed by an empty line)
/// ```text
/// --------------- UCS-2 of "￮" ---------------
/// Hex: [ffee]
/// Dec: [65518]
/// ---------------------------------------------
/// ```
pub fn print_ucs2<T: AsRef<Vec<u16>>>(utf2_cp: T) {
print_ucs2_vec(utf2_cp, false);
}

/// Pretty print a vector of UCS-2 code units in hexadecimal, binary and decimal.
///
/// This is a thin wrapper that calls `print_ucs2_vec` with the binary
/// representation enabled.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code units.
///
/// # Note
/// The values printed in hexadecimal are UCS-2 code units.
///
/// # Example
/// ```rust
/// use encdec::prelude::*;
/// let v: Vec<u16> = vec![0xFFEE];
/// ucs2::print_ucs2_b(&v);
/// ```
/// **Output** (preceded and followed by an empty line)
/// ```text
/// --------------- UCS-2 of "￮" ---------------
/// Hex: [ffee]
/// Bin: ["1111111111101110"]
/// Dec: [65518]
/// ---------------------------------------------
/// ```
pub fn print_ucs2_b<T: AsRef<Vec<u16>>>(ucs2_cp: T) {
print_ucs2_vec(ucs2_cp, true);
}
/// Encode a vector of unicode code points into a vector of UCS-2 code units.
///
/// UCS-2 stores every code point in a single 16-bit code unit, so each input
/// value is emitted unchanged, narrowed to [`u16`].
///
/// # Parameters
/// * `unicode_cp`: [`Vec<u32>`] - A vector of unicode code points.
///
/// # Returns
/// A [`Vec<u16>`] containing one UCS-2 code unit per input code point.
///
/// # Panics
/// * If the input vector (`unicode_cp`) contains a code point greater than
///   `0xFFFF`, which cannot be represented in UCS-2.
///
/// No other validation is performed: values in the surrogate range are passed
/// through unchanged.
///
/// # Example
/// ```rust
/// use encdec::prelude::*;
/// let v: Vec<u32> = vec![0xFFEE]; // Array of code points in unicode
/// let enc: Vec<u16> = ucs2::encode_in_ucs2(&v);
/// assert_eq!(enc, vec![0xFFEE]);
/// ```
pub fn encode_in_ucs2<T: AsRef<Vec<u32>>>(unicode_cp: T) -> Vec<u16> {
    unicode_cp
        .as_ref()
        .iter()
        .map(|&code_point| {
            // Anything above the BMP would need a surrogate pair (UTF-16), not UCS-2.
            if code_point > 0xFFFF {
                panic!("Invalid UCS-2 sequence");
            }
            // Lossless: the guard above guarantees the value fits in 16 bits.
            code_point as u16
        })
        .collect()
}

pub fn ucs2_decode<T: AsRef<Vec<u16>>>(v: T) -> Vec<u32> {
/// Decode a vector of UCS-2 code points into a vector of unicode code points.
///
/// # Parameters
/// * `ucs2_cp`: [`Vec<u16>`] - A vector of UCS-2 code points.
///
/// # Returns
/// A [`Vec<u32>`] containing the unicode code points.
///
/// # Panics
/// * If the input vector (`ucs2_cp`) of UCS-2 code points contains invalid UCS-2 code points.
///
/// # Example
/// ```rust
/// use encdec::prelude::*;
/// let v: Vec<u16> = vec![0xFFEE]; // Array of code points in UCS-2
/// let dec: Vec<u32> = ucs2::decode_from_ucs2(&v);
/// assert_eq!(dec, vec![0xFFEE]);
/// ```
pub fn decode_from_ucs2<T: AsRef<Vec<u16>>>(ucs2_cp: T) -> Vec<u32> {
let mut new_v: Vec<u32> = Vec::new();
let v: Vec<u16> = v.as_ref().to_vec();
let v: Vec<u16> = ucs2_cp.as_ref().to_vec();
let mut i = 0;
while i < v.len() {
let code_point = v[i];
if code_point >= 0xD800 && code_point <= 0xDBFF {
let extra = v[i + 1];
if (extra & 0xFC00) == 0xDC00 {
new_v.push(((((code_point & 0x3FF) << 10) + (extra & 0x3FF)) as u32) + 0x10000);
i += 1;
} else {
panic!("Invalid UCS-2 sequence");
}
} else {
new_v.push(code_point as u32);
panic!("Invalid UCS-2 sequence");
}
new_v.push(code_point as u32);
i += 1;
}
new_v
Expand Down
2 changes: 1 addition & 1 deletion src/utf8/utf8.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ A unicode code point is represented using one to four bytes in UTF-8, depending
* If the unicode code point is in the range `0x10000` to `0x10FFFF`, it is represented using [four bytes](#four-bytes).
# Decoding
A UTF-8 code point is decoded into a unicode code point using the following rules.
A UTF-8 code point is decoded into a unicode code point using the following rules:
* If the first bit of the UTF-8 code point is 0, the unicode code point is represented using [one byte](#one-byte).
* If the first three bits of the UTF-8 code point are 110, the unicode code point is represented using [two bytes](#two-bytes).
* If the first four bits of the UTF-8 code point are 1110, the unicode code point is represented using [three bytes](#three-bytes).
Expand Down

0 comments on commit 144e178

Please sign in to comment.