Skip to main content

gimli/
case_fold.rs

1include!("case_fold_data.rs");
2
3fn case_fold_data(c: char) -> char {
4    match CASE_FOLD_DATA.binary_search_by(|&(key, _)| key.cmp(&c)) {
5        Ok(i) => CASE_FOLD_DATA[i].1,
6        Err(_) => c,
7    }
8}
9
10/// Perform case folding for the DWARF name index hashing.
11///
12/// This implements the case folding specified in DWARF 5 Section 6.1.1.4.5.
13///
14/// "The simple case folding algorithm is further described in the CaseFolding.txt file
15/// distributed with the Unicode Character Database. That file defines four classes of
16/// mappings: Common (C), Simple (S), Full (F), and Turkish (T). The hash
17/// computation specified here uses the C + S mappings only, which do not affect the
18/// total length of the string, with the addition that Turkish upper case dotted ’İ’ and
19/// lower case dotless ’ı’ are folded to the Latin lower case ’i’."
20pub fn case_fold(c: char) -> char {
21    if c.is_ascii() {
22        (c as u8).to_ascii_lowercase() as char
23    } else {
24        case_fold_data(c)
25    }
26}
27
28/// Calculate a case folding DJB hash for the DWARF name index.
29///
30/// This uses the case folding specified in DWARF 5 Section 6.1.1.4.5
31/// with the DJB hash specified in DWARF 5 Section 7.33.
32pub fn case_folding_djb_hash(s: &str) -> u32 {
33    let mut hash: u32 = 5381;
34    for c in s.chars() {
35        if c.is_ascii() {
36            let byte = (c as u8).to_ascii_lowercase();
37            hash = djb_hash_byte(hash, byte);
38        } else {
39            let c = case_fold_data(c);
40            let mut bytes = [0; 4];
41            for byte in c.encode_utf8(&mut bytes).as_bytes() {
42                hash = djb_hash_byte(hash, *byte);
43            }
44        }
45    }
46    hash
47}
48
49#[inline]
50fn djb_hash_byte(hash: u32, byte: u8) -> u32 {
51    hash.wrapping_mul(33).wrapping_add(u32::from(byte))
52}
53
54#[cfg(test)]
55mod tests {
56    use super::*;
57
58    #[test]
59    fn test_case_fold() {
60        for (c, fold) in [
61            ('A', 'a'), // ASCII
62            ('1', '1'), // Identity
63            ('Σ', 'σ'), // Greek
64            ('K', 'k'), // Kelvin
65            ('I', 'i'), // Latin capital letter I
66            // DWARF extension to simple case folding: Turkish upper case dotted ’İ’
67            // and lower case dotless ’ı’ are folded to the Latin lower case ’i’
68            ('İ', 'i'),
69            ('ı', 'i'),
70        ] {
71            assert_eq!(case_fold(c), fold);
72            assert_eq!(case_fold(fold), fold);
73        }
74    }
75
76    #[test]
77    fn test_case_folding_djb_hash() {
78        assert_eq!(case_folding_djb_hash(""), 5381);
79        // Test case copied from LLVM
80        let s = "İıÀàĀāĹĺЕеẦầKkⰝⱍMm𐲒𐳒";
81        assert_eq!(case_folding_djb_hash(s), 1145571043);
82    }
83}