demangle-rust.c (6620B)
1// SPDX-License-Identifier: GPL-2.0 2#include <string.h> 3#include "debug.h" 4 5#include "demangle-rust.h" 6 7/* 8 * Mangled Rust symbols look like this: 9 * 10 * _$LT$std..sys..fd..FileDesc$u20$as$u20$core..ops..Drop$GT$::drop::hc68340e1baa4987a 11 * 12 * The original symbol is: 13 * 14 * <std::sys::fd::FileDesc as core::ops::Drop>::drop 15 * 16 * The last component of the path is a 64-bit hash in lowercase hex, prefixed 17 * with "h". Rust does not have a global namespace between crates, an illusion 18 * which Rust maintains by using the hash to distinguish things that would 19 * otherwise have the same symbol. 20 * 21 * Any path component not starting with a XID_Start character is prefixed with 22 * "_". 23 * 24 * The following escape sequences are used: 25 * 26 * "," => $C$ 27 * "@" => $SP$ 28 * "*" => $BP$ 29 * "&" => $RF$ 30 * "<" => $LT$ 31 * ">" => $GT$ 32 * "(" => $LP$ 33 * ")" => $RP$ 34 * " " => $u20$ 35 * "'" => $u27$ 36 * "[" => $u5b$ 37 * "]" => $u5d$ 38 * "~" => $u7e$ 39 * 40 * A double ".." means "::" and a single "." means "-". 41 * 42 * The only characters allowed in the mangled symbol are a-zA-Z0-9 and _.:$ 43 */ 44 45static const char *hash_prefix = "::h"; 46static const size_t hash_prefix_len = 3; 47static const size_t hash_len = 16; 48 49static bool is_prefixed_hash(const char *start); 50static bool looks_like_rust(const char *sym, size_t len); 51static bool unescape(const char **in, char **out, const char *seq, char value); 52 53/* 54 * INPUT: 55 * sym: symbol that has been through BFD-demangling 56 * 57 * This function looks for the following indicators: 58 * 59 * 1. The hash must consist of "h" followed by 16 lowercase hex digits. 60 * 61 * 2. As a sanity check, the hash must use between 5 and 15 of the 16 possible 62 * hex digits. This is true of 99.9998% of hashes so once in your life you 63 * may see a false negative. The point is to notice path components that 64 * could be Rust hashes but are probably not, like "haaaaaaaaaaaaaaaa". In 65 * this case a false positive (non-Rust symbol has an important path 66 * component removed because it looks like a Rust hash) is worse than a 67 * false negative (the rare Rust symbol is not demangled) so this sets the 68 * balance in favor of false negatives. 69 * 70 * 3. There must be no characters other than a-zA-Z0-9 and _.:$ 71 * 72 * 4. There must be no unrecognized $-sign sequences. 73 * 74 * 5. There must be no sequence of three or more dots in a row ("..."). 75 */ 76bool 77rust_is_mangled(const char *sym) 78{ 79 size_t len, len_without_hash; 80 81 if (!sym) 82 return false; 83 84 len = strlen(sym); 85 if (len <= hash_prefix_len + hash_len) 86 /* Not long enough to contain "::h" + hash + something else */ 87 return false; 88 89 len_without_hash = len - (hash_prefix_len + hash_len); 90 if (!is_prefixed_hash(sym + len_without_hash)) 91 return false; 92 93 return looks_like_rust(sym, len_without_hash); 94} 95 96/* 97 * A hash is the prefix "::h" followed by 16 lowercase hex digits. The hex 98 * digits must comprise between 5 and 15 (inclusive) distinct digits. 99 */ 100static bool is_prefixed_hash(const char *str) 101{ 102 const char *end; 103 bool seen[16]; 104 size_t i; 105 int count; 106 107 if (strncmp(str, hash_prefix, hash_prefix_len)) 108 return false; 109 str += hash_prefix_len; 110 111 memset(seen, false, sizeof(seen)); 112 for (end = str + hash_len; str < end; str++) 113 if (*str >= '0' && *str <= '9') 114 seen[*str - '0'] = true; 115 else if (*str >= 'a' && *str <= 'f') 116 seen[*str - 'a' + 10] = true; 117 else 118 return false; 119 120 /* Count how many distinct digits seen */ 121 count = 0; 122 for (i = 0; i < 16; i++) 123 if (seen[i]) 124 count++; 125 126 return count >= 5 && count <= 15; 127} 128 129static bool looks_like_rust(const char *str, size_t len) 130{ 131 const char *end = str + len; 132 133 while (str < end) 134 switch (*str) { 135 case '$': 136 if (!strncmp(str, "$C$", 3)) 137 str += 3; 138 else if (!strncmp(str, "$SP$", 4) 139 || !strncmp(str, "$BP$", 4) 140 || !strncmp(str, "$RF$", 4) 141 || !strncmp(str, "$LT$", 4) 142 || !strncmp(str, "$GT$", 4) 143 || !strncmp(str, "$LP$", 4) 144 || !strncmp(str, "$RP$", 4)) 145 str += 4; 146 else if (!strncmp(str, "$u20$", 5) 147 || !strncmp(str, "$u27$", 5) 148 || !strncmp(str, "$u5b$", 5) 149 || !strncmp(str, "$u5d$", 5) 150 || !strncmp(str, "$u7e$", 5)) 151 str += 5; 152 else 153 return false; 154 break; 155 case '.': 156 /* Do not allow three or more consecutive dots */ 157 if (!strncmp(str, "...", 3)) 158 return false; 159 /* Fall through */ 160 case 'a' ... 'z': 161 case 'A' ... 'Z': 162 case '0' ... '9': 163 case '_': 164 case ':': 165 str++; 166 break; 167 default: 168 return false; 169 } 170 171 return true; 172} 173 174/* 175 * INPUT: 176 * sym: symbol for which rust_is_mangled(sym) returns true 177 * 178 * The input is demangled in-place because the mangled name is always longer 179 * than the demangled one. 180 */ 181void 182rust_demangle_sym(char *sym) 183{ 184 const char *in; 185 char *out; 186 const char *end; 187 188 if (!sym) 189 return; 190 191 in = sym; 192 out = sym; 193 end = sym + strlen(sym) - (hash_prefix_len + hash_len); 194 195 while (in < end) 196 switch (*in) { 197 case '$': 198 if (!(unescape(&in, &out, "$C$", ',') 199 || unescape(&in, &out, "$SP$", '@') 200 || unescape(&in, &out, "$BP$", '*') 201 || unescape(&in, &out, "$RF$", '&') 202 || unescape(&in, &out, "$LT$", '<') 203 || unescape(&in, &out, "$GT$", '>') 204 || unescape(&in, &out, "$LP$", '(') 205 || unescape(&in, &out, "$RP$", ')') 206 || unescape(&in, &out, "$u20$", ' ') 207 || unescape(&in, &out, "$u27$", '\'') 208 || unescape(&in, &out, "$u5b$", '[') 209 || unescape(&in, &out, "$u5d$", ']') 210 || unescape(&in, &out, "$u7e$", '~'))) { 211 pr_err("demangle-rust: unexpected escape sequence"); 212 goto done; 213 } 214 break; 215 case '_': 216 /* 217 * If this is the start of a path component and the next 218 * character is an escape sequence, ignore the 219 * underscore. The mangler inserts an underscore to make 220 * sure the path component begins with a XID_Start 221 * character. 222 */ 223 if ((in == sym || in[-1] == ':') && in[1] == '$') 224 in++; 225 else 226 *out++ = *in++; 227 break; 228 case '.': 229 if (in[1] == '.') { 230 /* ".." becomes "::" */ 231 *out++ = ':'; 232 *out++ = ':'; 233 in += 2; 234 } else { 235 /* "." becomes "-" */ 236 *out++ = '-'; 237 in++; 238 } 239 break; 240 case 'a' ... 'z': 241 case 'A' ... 'Z': 242 case '0' ... '9': 243 case ':': 244 *out++ = *in++; 245 break; 246 default: 247 pr_err("demangle-rust: unexpected character '%c' in symbol\n", 248 *in); 249 goto done; 250 } 251 252done: 253 *out = '\0'; 254} 255 256static bool unescape(const char **in, char **out, const char *seq, char value) 257{ 258 size_t len = strlen(seq); 259 260 if (strncmp(*in, seq, len)) 261 return false; 262 263 **out = value; 264 265 *in += len; 266 *out += 1; 267 268 return true; 269}