gimli/case_fold.rs
1include!("case_fold_data.rs");
2
3fn case_fold_data(c: char) -> char {
4 match CASE_FOLD_DATA.binary_search_by(|&(key, _)| key.cmp(&c)) {
5 Ok(i) => CASE_FOLD_DATA[i].1,
6 Err(_) => c,
7 }
8}
9
10/// Perform case folding for the DWARF name index hashing.
11///
12/// This implements the case folding specified in DWARF 5 Section 6.1.1.4.5.
13///
14/// "The simple case folding algorithm is further described in the CaseFolding.txt file
15/// distributed with the Unicode Character Database. That file defines four classes of
16/// mappings: Common (C), Simple (S), Full (F), and Turkish (T). The hash
17/// computation specified here uses the C + S mappings only, which do not affect the
18/// total length of the string, with the addition that Turkish upper case dotted ’İ’ and
19/// lower case dotless ’ı’ are folded to the Latin lower case ’i’."
20pub fn case_fold(c: char) -> char {
21 if c.is_ascii() {
22 (c as u8).to_ascii_lowercase() as char
23 } else {
24 case_fold_data(c)
25 }
26}
27
28/// Calculate a case folding DJB hash for the DWARF name index.
29///
30/// This uses the case folding specified in DWARF 5 Section 6.1.1.4.5
31/// with the DJB hash specified in DWARF 5 Section 7.33.
32pub fn case_folding_djb_hash(s: &str) -> u32 {
33 let mut hash: u32 = 5381;
34 for c in s.chars() {
35 if c.is_ascii() {
36 let byte = (c as u8).to_ascii_lowercase();
37 hash = djb_hash_byte(hash, byte);
38 } else {
39 let c = case_fold_data(c);
40 let mut bytes = [0; 4];
41 for byte in c.encode_utf8(&mut bytes).as_bytes() {
42 hash = djb_hash_byte(hash, *byte);
43 }
44 }
45 }
46 hash
47}
48
49#[inline]
50fn djb_hash_byte(hash: u32, byte: u8) -> u32 {
51 hash.wrapping_mul(33).wrapping_add(u32::from(byte))
52}
53
54#[cfg(test)]
55mod tests {
56 use super::*;
57
58 #[test]
59 fn test_case_fold() {
60 for (c, fold) in [
61 ('A', 'a'), // ASCII
62 ('1', '1'), // Identity
63 ('Σ', 'σ'), // Greek
64 ('K', 'k'), // Kelvin
65 ('I', 'i'), // Latin capital letter I
66 // DWARF extension to simple case folding: Turkish upper case dotted ’İ’
67 // and lower case dotless ’ı’ are folded to the Latin lower case ’i’
68 ('İ', 'i'),
69 ('ı', 'i'),
70 ] {
71 assert_eq!(case_fold(c), fold);
72 assert_eq!(case_fold(fold), fold);
73 }
74 }
75
76 #[test]
77 fn test_case_folding_djb_hash() {
78 assert_eq!(case_folding_djb_hash(""), 5381);
79 // Test case copied from LLVM
80 let s = "İıÀàĀāĹĺЕеẦầKkⰝⱍMm𐲒𐳒";
81 assert_eq!(case_folding_djb_hash(s), 1145571043);
82 }
83}