mod.rs - mozsearch

// Copyright 2015 Ted Mielczarek. See the COPYRIGHT

// file at the top-level directory of this distribution.

use crate::{FrameSymbolizer, FrameWalker, Module, SymbolError};

pub use crate::sym_file::types::*;

pub use parser::SymbolParser;

use std::fs::File;

use std::io::Read;

use std::ops::Deref;

use std::path::Path;

use tracing::trace;

mod parser;

mod types;

pub mod walker;

// # Sync VS Async

//

// There is both a sync and an async entry-point to the parser.

// The two impls should be essentially identical, except for how they

// read bytes from the input reader into our circular buffer.

//

//

// # Streaming

//

// This parser streams the input to avoid the need to materialize all of

// it into memory at once (symbol files can be a gigabyte!). As a result,

// we need to iteratively parse.

//

// We do this by repeatedly filling up a buffer with input and asking the

// parser to parse it. The parser will return how much of the input it

// consumed, which we can use to clear space in our buffer and to tell

// if it successfully consumed the whole input when the Reader runs dry.

//

//

// # Handling EOF / Capacity

//

// Having a fixed-size buffer has one fatal issue: if one atomic step

// of the parser needs more than this amount of data, then we won't

// be able to parse it.

//

// This can result in `buf` filling up and `buf.space()` becoming an

// empty slice. This in turn will make the reader yield 0 bytes, and

// we'll treat it like EOF and fail the parse. When this happens, we

// try to double the buffer's size and request more bytes. If we get

// more, hooray! If we don't, then it's a "real" EOF.

//

// The "atom" of our parser is a line, so we need our buffer to be able

// to fit any line. However we actually only have roughly

// *half* this value as our limit, as circular::Buffer will only

// `shift` the buffer's contents if over half of its capacity has been

// drained by `consume` -- and `space()` only grows when a `shift` happens.

//

// I have in fact seen 8kb function names from Rust (thanks generic combinators!)

// and 82kb function names from C++ (thanks 'auto' returns!), so we

// need a buffer size that can grow to at least 200KB. This is a *very* large

// amount to backshift repeatedly, so to keep this under control, we start

// with only a 10KB buffer, which is generous but tolerable.

//

// We should still have *SOME* limit on this to avoid nasty death spirals,
// so let's go with 160KB (MAX_BUFFER_CAPACITY), which guarantees room for an
// 80KB line and may fit up to 160KB depending on the state of the buffer.

//

// But just *dying* when we hit this point is terrible, so let's have an

// extra layer of robustness: if we ever hit the limit, enter "panic recovery"

// and just start discarding bytes until we hit a newline. Then resume normal

// parsing. The net effect of this is that we just treat this one line as

// corrupt (because statistically it won't even be needed!).

// Upper bound on the streaming buffer's capacity. When doubling the buffer
// would exceed this, the parser stops growing it and discards the current
// line instead ("panic recovery").
//
// Allows for at least 80KB symbol names, at most 160KB symbol names (fuzzy because of circular).
static MAX_BUFFER_CAPACITY: usize = 1024 * 160;

// Capacity the streaming buffer is created with (10KB). It is doubled on
// demand, but never beyond MAX_BUFFER_CAPACITY.
static INITIAL_BUFFER_CAPACITY: usize = 1024 * 10;

impl SymbolFile {

    /// Parse a SymbolFile from the given Reader.

///

    /// Every time a chunk of the input is parsed, that chunk will

    /// be passed to `callback` to allow you to do something else

    /// with the data as it's streamed in (e.g. you can save the

    /// input to a cache).

///

    /// The reader is wrapped in a buffer reader so you shouldn't

    /// buffer the input yourself.

    pub fn parse<R: Read>(

        mut input_reader: R,

        mut callback: impl FnMut(&[u8]),

    ) -> Result<SymbolFile, SymbolError> {

        let mut buf = circular::Buffer::with_capacity(INITIAL_BUFFER_CAPACITY);

        let mut parser = SymbolParser::new();

        let mut fully_consumed = false;

        let mut tried_to_grow = false;

        let mut in_panic_recovery = false;

        let mut just_finished_recovering = false;

        let mut total_consumed = 0u64;

        loop {

            if in_panic_recovery {

                // PANIC RECOVERY MODE! DISCARD BYTES UNTIL NEWLINE.

                let input = buf.data();

                if let Some(new_line_idx) = input.iter().position(|&byte| byte == b'\n') {

                    // Hooray, we found a new line! Consume up to and including that, and resume.

                    let amount = new_line_idx + 1;

                    callback(&input[..amount]);

                    buf.consume(amount);

                    total_consumed += amount as u64;

                    // Back to normal!

                    in_panic_recovery = false;

                    fully_consumed = false;

                    just_finished_recovering = true;

                    parser.lines += 1;

                    trace!("RECOVERY: complete!");

                } else {

                    // No newline, discard everything

                    let amount = input.len();

                    callback(&input[..amount]);

                    buf.consume(amount);

                    total_consumed += amount as u64;

                    // If the next read returns 0 bytes, then that's a proper EOF!

                    fully_consumed = true;

            // Read the data in, and tell the circular buffer about the new data

            let size = input_reader.read(buf.space())?;

            buf.fill(size);

            if size == 0 {

                // If the reader returned no more bytes, this can be either mean

                // EOF or the buffer is out of capacity. There are a lot of cases

                // to consider, so let's go through them one at a time...

                if just_finished_recovering && !buf.data().is_empty() {

                    // We just finished PANIC RECOVERY, but there's still bytes in

                    // the buffer. Assume that is parseable and resume normal parsing

                    // (do nothing, fallthrough to normal path).

                } else if fully_consumed {

                    // Success! The last iteration cleared the buffer and we still got

                    // no more bytes, so that's a proper EOF with a complete parse!

                    return Ok(parser.finish());

                } else if !tried_to_grow {

                    // We still have some stuff in the buffer, assume this is because

                    // the buffer is full, and try to make it BIGGER and ask for more again.

                    let new_cap = buf.capacity().saturating_mul(2);

                    if new_cap > MAX_BUFFER_CAPACITY {

                        // TIME TO PANIC!!! This line is catastrophically big, just start

                        // discarding bytes until we hit a newline.

                        trace!("RECOVERY: discarding enormous line {}", parser.lines);

                        in_panic_recovery = true;

                        continue;

                    trace!("parser out of space? trying more ({}KB)", new_cap / 1024);

                    buf.grow(new_cap);

                    tried_to_grow = true;

                    continue;

                } else if total_consumed == 0 {

                    // We grew the buffer and still got no more bytes, so it's a proper EOF.

                    // But actually, we never consumed any bytes, so this is an empty file?

                    // Give a better error message for that.

                    return Err(SymbolError::ParseError(

                        "empty SymbolFile (probably something wrong with your debuginfo tooling?)",

0,

));

                } else {

                    // Ok give up, this input is just impossible.

                    return Err(SymbolError::ParseError(

                        "unexpected EOF during parsing of SymbolFile (or a line was too long?)",

                        parser.lines,

));

            } else {

                tried_to_grow = false;

            if in_panic_recovery {

                // Don't run the normal parser while we're still recovering!

                continue;

            just_finished_recovering = false;

            // Ask the parser to parse more of the input

            let input = buf.data();

            let consumed = parser.parse_more(input)?;

            total_consumed += consumed as u64;

            // Give the other consumer of this Reader a chance to use this data.

            callback(&input[..consumed]);

            // Remember for the next iteration if all the input was consumed.

            fully_consumed = input.len() == consumed;

            buf.consume(consumed);

    /// `parse` but async

    #[cfg(feature = "http")]

    pub async fn parse_async(

        mut response: reqwest::Response,

        mut callback: impl FnMut(&[u8]),

    ) -> Result<SymbolFile, SymbolError> {

        let mut chunk;

        let mut slice = &[][..];

        let mut input_reader = &mut slice;

        let mut buf = circular::Buffer::with_capacity(INITIAL_BUFFER_CAPACITY);

        let mut parser = SymbolParser::new();

        let mut fully_consumed = false;

        let mut tried_to_grow = false;

        let mut in_panic_recovery = false;

        let mut just_finished_recovering = false;

        let mut total_consumed = 0u64;

        loop {

            if in_panic_recovery {

                // PANIC RECOVERY MODE! DISCARD BYTES UNTIL NEWLINE.

                let input = buf.data();

                if let Some(new_line_idx) = input.iter().position(|&byte| byte == b'\n') {

                    // Hooray, we found a new line! Consume up to and including that, and resume.

                    let amount = new_line_idx + 1;

                    callback(&input[..amount]);

                    buf.consume(amount);

                    total_consumed += amount as u64;

                    // Back to normal!

                    in_panic_recovery = false;

                    fully_consumed = false;

                    just_finished_recovering = true;

                    parser.lines += 1;

                    trace!("PANIC RECOVERY: complete!");

                } else {

                    // No newline, discard everything

                    let amount = input.len();

                    callback(&input[..amount]);

                    buf.consume(amount);

                    total_consumed += amount as u64;

                    // If the next read returns 0 bytes, then that's a proper EOF!

                    fully_consumed = true;

            // Little rube-goldberg machine to stream the contents:

            // * get a chunk (Bytes) from the Response

            // * get its underlying slice

            // * then get a mutable reference to that slice

            // * then Read that mutable reference in our circular buffer

            // * when the slice runs out, get the next chunk and repeat

            if input_reader.is_empty() {

                chunk = response

                    .chunk()

                    .await

                    .map_err(|e| std::io::Error::new(std::io::ErrorKind::Other, e))?

                    .unwrap_or_default();

                slice = &chunk[..];

                input_reader = &mut slice;

            // Read the data in, and tell the circular buffer about the new data

            let size = input_reader.read(buf.space())?;

            buf.fill(size);

            if size == 0 {

                // If the reader returned no more bytes, this can be either mean

                // EOF or the buffer is out of capacity. There are a lot of cases

                // to consider, so let's go through them one at a time...

                if just_finished_recovering && !buf.data().is_empty() {

                    // We just finished PANIC RECOVERY, but there's still bytes in

                    // the buffer. Assume that is parseable and resume normal parsing

                    // (do nothing, fallthrough to normal path).

                } else if fully_consumed {

                    // Success! The last iteration cleared the buffer and we still got

                    // no more bytes, so that's a proper EOF with a complete parse!

                    return Ok(parser.finish());

                } else if !tried_to_grow {

                    // We still have some stuff in the buffer, assume this is because

                    // the buffer is full, and try to make it BIGGER and ask for more again.

                    let new_cap = buf.capacity().saturating_mul(2);

                    if new_cap > MAX_BUFFER_CAPACITY {

                        // TIME TO PANIC!!! This line is catastrophically big, just start

                        // discarding bytes until we hit a newline.

                        trace!("RECOVERY: discarding enormous line {}", parser.lines);

                        in_panic_recovery = true;

                        continue;

                    trace!("parser out of space? trying more ({}KB)", new_cap / 1024);

                    buf.grow(new_cap);

                    tried_to_grow = true;

                    continue;

                } else if total_consumed == 0 {

                    // We grew the buffer and still got no more bytes, so it's a proper EOF.

                    // But actually, we never consumed any bytes, so this is an empty file?

                    // Give a better error message for that.

                    return Err(SymbolError::ParseError(

                        "empty SymbolFile (probably something wrong with your debuginfo tooling?)",

0,

));

                } else {

                    // Ok give up, this input is just impossible.

                    return Err(SymbolError::ParseError(

                        "unexpected EOF during parsing of SymbolFile (or a line was too long?)",

                        parser.lines,

));

            } else {

                tried_to_grow = false;

            if in_panic_recovery {

                // Don't run the normal parser while we're still recovering!

                continue;

            just_finished_recovering = false;

            // Ask the parser to parse more of the input

            let input = buf.data();

            let consumed = parser.parse_more(input)?;

            total_consumed += consumed as u64;

            // Give the other consumer of this Reader a chance to use this data.

            callback(&input[..consumed]);

            // Remember for the next iteration if all the input was consumed.

            fully_consumed = input.len() == consumed;

            buf.consume(consumed);

    // Parse a SymbolFile from bytes.

    pub fn from_bytes(bytes: &[u8]) -> Result<SymbolFile, SymbolError> {

        Self::parse(bytes, |_| ())

    // Parse a SymbolFile from a file.

    pub fn from_file(path: &Path) -> Result<SymbolFile, SymbolError> {

        let file = File::open(path)?;

        Self::parse(file, |_| ())

    /// Fill in as much source information for `frame` as possible.

    pub fn fill_symbol(&self, module: &dyn Module, frame: &mut dyn FrameSymbolizer) {

        // Look for a FUNC covering the address first.

        if frame.get_instruction() < module.base_address() {

            return;

        let addr = frame.get_instruction() - module.base_address();

        if let Some(func) = self.functions.get(addr) {

            // TODO: although FUNC records have a parameter size, it appears that

            // they aren't to be trusted? The STACK WIN records are more reliable

            // when available. This is important precisely because these values

            // are used to unwind subsequent STACK WIN frames (because certain

            // calling conventions have the caller push the callee's arguments,

            // which affects the the stack's size!).

//

            // Need to spend more time thinking about if this is the right approach

            let parameter_size = if let Some(info) = self.win_stack_framedata_info.get(addr) {

                info.parameter_size

            } else if let Some(info) = self.win_stack_fpo_info.get(addr) {

                info.parameter_size

            } else {

                func.parameter_size

};

            frame.set_function(

                &func.name,

                func.address + module.base_address(),

                parameter_size,

);

            // See if there's source line and inline info as well.

//

            // In the following, we transform data between two different representations of inline calls.

            // The input shape has function names associated with the location of the call to that function.

            // The output shape has function names associated with a location *inside* that function.

//

            // Input:

//

            //   (

            //       outer_name,

            //       inline_calls: [ // Each location is the line of the *call* to the function

            //           (inline_call_location[0], inline_name[0]),

            //           (inline_call_location[1], inline_name[1]),

            //           (inline_call_location[2], inline_name[2]),

            //       ]

            //       innermost_location,

            //   )

//

            // Output:

//

            //   ( // Each location is the line *inside* the function

            //       (outer_name, inline_call_location[0]),

            //       inlines: [

            //           (inline_name[0], inline_call_location[1]),

            //           (inline_name[1], inline_call_location[2]),

            //           (inline_name[2], innermost_location),

            //       ]

            //   )

            if let Some((file_id, line, address, next_inline_origin)) =

                func.get_outermost_sourceloc(addr)

                if let Some(file) = self.files.get(&file_id) {

                    frame.set_source_file(file, line, address + module.base_address());

                if let Some(mut inline_origin) = next_inline_origin {

                    // There is an inline call at the address.

                    // Enumerate all inlines at the address one by one by looking up

                    // successively deeper call depths.

                    // The call to `get_outermost_source_location` above looked up depth 0, so here

                    // we start at depth 1.

                    for depth in 1.. {

                        match func.get_inlinee_at_depth(depth, addr) {

                            Some((call_file_id, call_line, _address, next_inline_origin)) => {

                                // We found another inline frame.

                                let call_file = self.files.get(&call_file_id).map(Deref::deref);

                                if let Some(name) = self.inline_origins.get(&inline_origin) {

                                    frame.add_inline_frame(name, call_file, Some(call_line));

                                inline_origin = next_inline_origin;

                            None => break,

                    // We've run out of inline calls but we still have to output the final frame.

                    let (file, line) = match func.get_innermost_sourceloc(addr) {

                        Some((file_id, line, _)) => (

                            self.files.get(&file_id).map(Deref::deref),

                            if line != 0 { Some(line) } else { None },

),

                        None => (None, None),

};

                    if let Some(name) = self.inline_origins.get(&inline_origin) {

                        frame.add_inline_frame(name, file, line);

        } else if let Some(public) = self.find_nearest_public(addr) {

            // We couldn't find a valid FUNC record, but we could find a PUBLIC record.

            // Unfortauntely, PUBLIC records don't have end-points, so this could be

            // a random PUBLIC record from the start of the module that isn't at all

            // applicable. To try limit this problem, we can use the nearest FUNC

            // record that comes *before* the address we're trying to find a symbol for.

//

            // It is reasonable to assume a PUBLIC record cannot extend *past* a FUNC,

            // so if the PUBLIC has a smaller base address than the nearest previous FUNC

            // to our target address, the PUBLIC must actually end before that FUNC and

            // therefore not actually apply to the target address.

//

            // We get the nearest previous FUNC by getting the raw slice of ranges

            // and binary searching for our base address. Rust's builtin binary search

            // will fail to find the value since it uses strict equality *but* the Err

            // will helpfully contain the index in the slice where our value "should"

            // be inserted to preserve the sort. The element before this index is

            // therefore the nearest previous value!

//

            // Case analysis for this -1 because binary search is an off-by-one minefield:

//

            // * if the address we were looking for came *before* every FUNC, binary_search

            //   would yield "0" because that's where it should go to preserve the sort.

            //   The checked_sub will then fail and make us just assume the PUBLIC is reasonable,

            //   which is correct.

//

            // * if we get 1, this saying we actually want element 0, so again -1 is

            //   correct. (This generalizes to all other "reasonable" values, but 1 is easiest

            //   to think about given the previous case's analysis.)

//

            // * if the address we were looking for came *after* every FUNC, binary search

            //   would yield "slice.len()", and the nearest FUNC is indeed at `len-1`, so

            //   again correct.

            let funcs_slice = self.functions.ranges_values().as_slice();

            let prev_func = funcs_slice

                .binary_search_by_key(&addr, |(range, _)| range.start)

                .err()

                .and_then(|idx| idx.checked_sub(1))

                .and_then(|idx| funcs_slice.get(idx));

            if let Some(prev_func) = prev_func {

                if public.address <= prev_func.1.address {

                    // This PUBLIC is truncated by a FUNC before it gets to `addr`,

                    // so we shouldn't use it.

                    return;

            // Settle for a PUBLIC.

            frame.set_function(

                &public.name,

                public.address + module.base_address(),

                public.parameter_size,

);

    pub fn walk_frame(&self, module: &dyn Module, walker: &mut dyn FrameWalker) -> Option<()> {

        if walker.get_instruction() < module.base_address() {

            return None;

        let addr = walker.get_instruction() - module.base_address();

        // Preferentially use framedata over fpo, because if both are present,

        // the former tends to be more precise (breakpad heuristic).

        let win_stack_result = if let Some(info) = self.win_stack_framedata_info.get(addr) {

            walker::walk_with_stack_win_framedata(info, walker)

        } else if let Some(info) = self.win_stack_fpo_info.get(addr) {

            walker::walk_with_stack_win_fpo(info, walker)

        } else {

            None

};

        // If STACK WIN failed, try STACK CFI

        win_stack_result.or_else(|| {

            if let Some(info) = self.cfi_stack_info.get(addr) {

                // Don't use add_rules that come after this address

                let mut count = 0;

                let len = info.add_rules.len();

                while count < len && info.add_rules[count].address <= addr {

                    count += 1;

                walker::walk_with_stack_cfi(&info.init, &info.add_rules[0..count], walker)

            } else {

                None

})

    /// Find the nearest `PublicSymbol` whose address is less than or equal to `addr`.

    pub fn find_nearest_public(&self, addr: u64) -> Option<&PublicSymbol> {

        self.publics.iter().rev().find(|&p| p.address <= addr)

#[cfg(test)]
mod test {
    use super::*;
    use std::ffi::OsStr;

    /// Parse the symbol file at `rel_path` (relative to the crate directory)
    /// and check the record counts and lookups expected for `test_app.sym`.
    fn test_symbolfile_from_file(rel_path: &str) {
        let mut path = std::env::current_dir().unwrap();
        // When run from the workspace root, descend into this crate first.
        if path.file_name() == Some(OsStr::new("rust-minidump")) {
            path.push("breakpad-symbols");
        }
        path.push(rel_path);
        let sym = SymbolFile::from_file(&path).unwrap();
        assert_eq!(sym.files.len(), 6661);
        assert_eq!(sym.publics.len(), 5);
        assert_eq!(sym.find_nearest_public(0x9b07).unwrap().name, "_NLG_Return");
        assert_eq!(
            sym.find_nearest_public(0x142e7).unwrap().name,
            "_NLG_Return"
        );
        assert_eq!(
            sym.find_nearest_public(0x23b06).unwrap().name,
            "__from_strstr_to_strchr"
        );
        assert_eq!(
            sym.find_nearest_public(0xFFFFFFFF).unwrap().name,
            "__from_strstr_to_strchr"
        );
        assert_eq!(sym.functions.ranges_values().count(), 1065);
        assert_eq!(sym.functions.get(0x1000).unwrap().name, "vswprintf");
        assert_eq!(sym.functions.get(0x1012).unwrap().name, "vswprintf");
        assert!(sym.functions.get(0x1013).is_none());
        // There are 1556 `STACK WIN 4` lines in the symbol file, but only 856
        // that don't overlap. However they all overlap in ways that we have
        // to handle in the wild.
        assert_eq!(sym.win_stack_framedata_info.ranges_values().count(), 1556);
        assert_eq!(sym.win_stack_fpo_info.ranges_values().count(), 259);
        assert_eq!(
            sym.win_stack_framedata_info.get(0x41b0).unwrap().address,
            0x41b0
        );
    }

    #[test]
    fn test_symbolfile_from_lf_file() {
        test_symbolfile_from_file(
            "testdata/symbols/test_app.pdb/5A9832E5287241C1838ED98914E9B7FF1/test_app.sym",
        );
    }

    #[test]
    fn test_symbolfile_from_crlf_file() {
        test_symbolfile_from_file(
            "testdata/symbols/test_app.pdb/6A9832E5287241C1838ED98914E9B7FF1/test_app.sym",
        );
    }

    /// Parse the small in-memory fixture used by the `_with_lf` / `_with_crlf`
    /// tests and check that each record kind was picked up exactly once.
    fn test_symbolfile_from_bytes(symbolfile_bytes: &[u8]) {
        let sym = SymbolFile::from_bytes(symbolfile_bytes).unwrap();

        assert_eq!(sym.files.len(), 1);
        assert_eq!(sym.publics.len(), 1);
        assert_eq!(sym.functions.ranges_values().count(), 1);
        assert_eq!(sym.functions.get(0x1000).unwrap().name, "another func");
        // The fixture has a single line record under its FUNC.
        assert_eq!(
            sym.functions
                .get(0x1000)
                .unwrap()
                .lines
                .ranges_values()
                .count(),
            1
        );
        // test fallback
        assert_eq!(sym.functions.get(0x1001).unwrap().name, "another func");
    }

    #[test]
    fn test_symbolfile_from_bytes_with_lf() {
        test_symbolfile_from_bytes(
            b"MODULE Linux x86 ffff0000 bar
FILE 53 bar.c
PUBLIC 1234 10 some public
FUNC 1000 30 10 another func
1000 30 7 53
",
        );
    }

    #[test]
    fn test_symbolfile_from_bytes_with_crlf() {
        // Every line ends in an explicit `\r` followed by the literal newline,
        // so the parser really sees CRLF line endings.
        test_symbolfile_from_bytes(
            b"MODULE Linux x86 ffff0000 bar\r
FILE 53 bar.c\r
PUBLIC 1234 10 some public\r
FUNC 1000 30 10 another func\r
1000 30 7 53\r
",
        );
    }
}

Revision control

Copy as Markdown

Other Tools