dwarf.rs - mozsearch

comm-central/third_party/rust/wast/src/core/binary/dwarf.rs

Enable keyboard shortcuts

Revision control

Copy as Markdown

Other Tools

HG Web

//! Implementation of emitting DWARF debugging information for `*.wat` files.

//!

//! This is intended to be relatively simple but the goal is to enable emission

//! of DWARF sections which point back to the original `*.wat` file itself. This

//! enables debuggers like LLDB to debug `*.wat` files without necessarily

//! built-in knowledge of WebAssembly itself.

//!

//! Overall I was curious one weekend and decided to implement this. It's an

//! off-by-default crate feature and an off-by-default runtime feature of this

//! crate. Hopefully doesn't carry too much complexity with it while still being

//! easy/fun to play around with.

use crate::core::binary::{EncodeOptions, Encoder, GenerateDwarf, Names, RecOrType};

use crate::core::{InnerTypeKind, Local, ValType};

use crate::token::Span;

use gimli::write::{

    self, Address, AttributeValue, DwarfUnit, Expression, FileId, LineProgram, LineString,

    Sections, UnitEntryId, Writer,

};

use gimli::{Encoding, Format, LineEncoding, LittleEndian};

use std::cmp::Ordering;

use std::collections::HashMap;

use std::path::Path;

pub struct Dwarf<'a> {

    // Metadata configured at `Dwarf` creation time

    func_names: HashMap<u32, &'a str>,

    func_imports: u32,

    contents: &'a str,

    file_id: FileId,

    style: GenerateDwarf,

    types: &'a [RecOrType<'a>],

    dwarf: DwarfUnit,

    // Next function index when `start_func` is called

    next_func: u32,

    // Current `DW_TAG_subprogram` that's being built as part of `start_func`.

    cur_subprogram: Option<UnitEntryId>,

    cur_subprogram_instrs: usize,

    // Code-section-relative offset of the start of every function. Filled in

    // as part of `end_func`.

    sym_offsets: Vec<usize>,

    // Metadata tracking what the line/column information was at the specified

    // last offset.

    line: u64,

    column: u64,

    last_offset: usize,

    i32_ty: Option<UnitEntryId>,

    i64_ty: Option<UnitEntryId>,

    f32_ty: Option<UnitEntryId>,

    f64_ty: Option<UnitEntryId>,

impl<'a> Dwarf<'a> {

    /// Creates a new `Dwarf` from the specified configuration.

///

    /// * `func_imports` - the number of imported functions in this module, or

    ///   the index of the first defined function.

    /// * `opts` - encoding options, namely where whether DWARF is to be emitted

    ///   is configured.

    /// * `names` - the `name` custom section for this module, used to name

    ///   functions in DWARF.

    pub fn new(

        func_imports: u32,

        opts: &EncodeOptions<'a>,

        names: &Names<'a>,

        types: &'a [RecOrType<'a>],

    ) -> Option<Dwarf<'a>> {

        // This is a load-bearing `?` which notably short-circuits all DWARF

        // machinery entirely if this was not enabled at runtime.

        let (file, contents, style) = opts.dwarf_info?;

        let file = file.to_str()?;

        // Configure some initial `gimli::write` context.

        let encoding = Encoding {

            address_size: 4,

            format: Format::Dwarf32,

            version: 5,

};

        let mut dwarf = DwarfUnit::new(encoding);

        let (comp_dir, comp_file) = match (

            Path::new(file).parent().and_then(|s| s.to_str()),

            Path::new(file).file_name().and_then(|s| s.to_str()),

) {

            (Some(parent), Some(file_name)) if !parent.is_empty() => (parent, file_name),

            _ => (".", file),

};

        let comp_dir_ref = dwarf.strings.add(comp_dir);

        let comp_file_ref = dwarf.strings.add(comp_file);

        dwarf.unit.line_program = LineProgram::new(

            encoding,

            LineEncoding::default(),

            LineString::StringRef(comp_dir_ref),

            LineString::StringRef(comp_file_ref),

            None,

);

        let dir_id = dwarf.unit.line_program.default_directory();

        let file_id =

            dwarf

                .unit

                .line_program

                .add_file(LineString::StringRef(comp_file_ref), dir_id, None);

        // Configure a single compilation unit which encompasses the entire code

        // section. The code section isn't fully known at this point so only a

        // "low pc" is emitted here.

        let root = dwarf.unit.root();

        let cu = dwarf.unit.get_mut(root);

        cu.set(

            gimli::DW_AT_producer,

            AttributeValue::String(format!("wast {}", env!("CARGO_PKG_VERSION")).into_bytes()),

);

        cu.set(

            gimli::DW_AT_language,

            // Technically this should be something like wasm or wat but that

            // doesn't exist so fill in something here.

            AttributeValue::Language(gimli::DW_LANG_C),

);

        cu.set(gimli::DW_AT_name, AttributeValue::StringRef(comp_file_ref));

        cu.set(

            gimli::DW_AT_comp_dir,

            AttributeValue::StringRef(comp_dir_ref),

);

        cu.set(gimli::DW_AT_low_pc, AttributeValue::Data4(0));

        // Build a lookup table for defined function index to its name.

        let mut func_names = HashMap::new();

        for (idx, name) in names.funcs.iter() {

            func_names.insert(*idx, *name);

        // Offsets pointing to newlines are considered internally as the 0th

        // column of the next line, so handle the case that the contents start

        // with a newline.

        let (line, column) = if contents.starts_with("\n") {

            (2, 0)

        } else {

            (1, 1)

};

        Some(Dwarf {

            dwarf,

            style,

            next_func: func_imports,

            func_imports,

            sym_offsets: Vec::new(),

            contents,

            line,

            column,

            last_offset: 0,

            file_id,

            cur_subprogram: None,

            cur_subprogram_instrs: 0,

            func_names,

            i32_ty: None,

            i64_ty: None,

            f32_ty: None,

            f64_ty: None,

            types,

})

    /// Start emitting a new function defined at `span`.

///

    /// This will start a new line program for this function and additionally

    /// configure a new `DW_TAG_subprogram` for this new function.

    pub fn start_func(&mut self, span: Span, ty: u32, locals: &[Local<'_>]) {

        self.change_linecol(span);

        let addr = Address::Symbol {

            symbol: (self.next_func - self.func_imports) as usize,

            addend: 0,

};

        self.dwarf.unit.line_program.begin_sequence(Some(addr));

        let root = self.dwarf.unit.root();

        let subprogram = self.dwarf.unit.add(root, gimli::DW_TAG_subprogram);

        let entry = self.dwarf.unit.get_mut(subprogram);

        let fallback = format!("wasm-function[{}]", self.next_func);

        let func_name = self

            .func_names

            .get(&self.next_func)

            .copied()

            .unwrap_or(&fallback);

        entry.set(gimli::DW_AT_name, AttributeValue::String(func_name.into()));

        entry.set(

            gimli::DW_AT_decl_file,

            AttributeValue::FileIndex(Some(self.file_id)),

);

        entry.set(gimli::DW_AT_decl_line, AttributeValue::Udata(self.line));

        entry.set(gimli::DW_AT_decl_column, AttributeValue::Udata(self.column));

        entry.set(gimli::DW_AT_external, AttributeValue::FlagPresent);

        entry.set(gimli::DW_AT_low_pc, AttributeValue::Address(addr));

        if let GenerateDwarf::Full = self.style {

            self.add_func_params_and_locals(subprogram, ty, locals);

        self.cur_subprogram = Some(subprogram);

        self.cur_subprogram_instrs = 0;

        self.next_func += 1;

    /// Adds `DW_TAG_formal_parameter` and `DW_TAG_variable` for all locals

    /// (which are both params and function-defined locals).

///

    /// This is pretty simple in that the expression for the location of

    /// these variables is constant, it's just "it's the local", and it spans

    /// the entire function.

    fn add_func_params_and_locals(

        &mut self,

        subprogram: UnitEntryId,

        ty: u32,

        locals: &[Local<'_>],

) {

        // Iterate through `self.types` which is what was encoded into the

        // module and find the function type which gives access to the

        // parameters which gives access to their types.

        let ty = self

            .types

            .iter()

            .flat_map(|t| match t {

                RecOrType::Type(t) => std::slice::from_ref(*t),

                RecOrType::Rec(r) => &r.types,

})

            .nth(ty as usize);

        let ty = match ty.map(|t| &t.def.kind) {

            Some(InnerTypeKind::Func(ty)) => ty,

            _ => return,

};

        let mut local_idx = 0;

        for (_, _, ty) in ty.params.iter() {

            self.local(local_idx, subprogram, gimli::DW_TAG_formal_parameter, ty);

            local_idx += 1;

        for local in locals {

            self.local(local_idx, subprogram, gimli::DW_TAG_variable, &local.ty);

            local_idx += 1;

    /// Attempts to define a local variable within `subprogram` with the `ty`

    /// given.

///

    /// This does nothing if `ty` can't be represented in DWARF.

    fn local(&mut self, local: u32, subprogram: UnitEntryId, tag: gimli::DwTag, ty: &ValType<'_>) {

        let ty = match self.val_type_to_dwarf_type(ty) {

            Some(ty) => ty,

            None => return,

};

        let param = self.dwarf.unit.add(subprogram, tag);

        let entry = self.dwarf.unit.get_mut(param);

        entry.set(

            gimli::DW_AT_name,

            AttributeValue::String(format!("local{local}").into()),

);

        let mut loc = Expression::new();

        loc.op_wasm_local(local);

        loc.op(gimli::DW_OP_stack_value);

        entry.set(gimli::DW_AT_location, AttributeValue::Exprloc(loc));

        entry.set(gimli::DW_AT_type, AttributeValue::UnitRef(ty));

    fn val_type_to_dwarf_type(&mut self, ty: &ValType<'_>) -> Option<UnitEntryId> {

        match ty {

            ValType::I32 => Some(self.i32_ty()),

            ValType::I64 => Some(self.i64_ty()),

            ValType::F32 => Some(self.f32_ty()),

            ValType::F64 => Some(self.f64_ty()),

            // TODO: make a C union of sorts or something like that to

            // represent v128 as an array-of-lanes or u128 or something like

            // that.

            ValType::V128 => None,

            // Not much that can be done about reference types without actually

            // knowing what the engine does, this probably needs an addition to

            // DWARF itself to represent this.

            ValType::Ref(_) => None,

    /// Emit an instruction which starts at `offset` and is defined at `span`.

///

    /// Note that `offset` is code-section-relative.

    pub fn instr(&mut self, offset: usize, span: Span) {

        self.change_linecol(span);

        let offset = u64::try_from(offset).unwrap();

        let mut changed = false;

        let row = self.dwarf.unit.line_program.row();

        set(&mut row.address_offset, offset, &mut changed);

        set(&mut row.line, self.line, &mut changed);

        set(&mut row.column, self.column, &mut changed);

        set(&mut row.file, self.file_id, &mut changed);

        set(&mut row.is_statement, true, &mut changed);

        set(

            &mut row.prologue_end,

            self.cur_subprogram_instrs == 0,

            &mut changed,

);

        if changed {

            self.dwarf.unit.line_program.generate_row();

        self.cur_subprogram_instrs += 1;

        fn set<T: PartialEq>(slot: &mut T, val: T, changed: &mut bool) {

            if *slot != val {

                *slot = val;

                *changed = true;

    /// Change `self.line` and `self.column` to be appropriate for the offset

    /// in `span`.

///

    /// This will incrementally move from `self.last_offset` to `span.offset()`

    /// and update line/column information as we go. It's assumed that this is

    /// more efficient than a precalculate-all-the-positions-for-each-byte

    /// approach since that would require a great deal of memory to store a

    /// line/col for all bytes in the input string. It's also assumed that most

    /// `span` adjustments are minor as it's between instructions in a function

    /// which are frequently close together. Whether or not this assumption

    /// pans out is yet to be seen.

    fn change_linecol(&mut self, span: Span) {

        let offset = span.offset();

        loop {

            match self.last_offset.cmp(&offset) {

                Ordering::Less => {

                    let haystack = self.contents[self.last_offset + 1..].as_bytes();

                    let next_newline = match memchr::memchr_iter(b'\n', haystack).next() {

                        Some(pos) => pos + self.last_offset + 1,

                        None => break,

};

                    if next_newline > offset {

                        break;

                    } else {

                        self.line += 1;

                        self.column = 0;

                        self.last_offset = next_newline;

                Ordering::Greater => {

                    let haystack = self.contents[..self.last_offset].as_bytes();

                    match memchr::memchr_iter(b'\n', haystack).next_back() {

                        Some(prev_newline) => {

                            if self.column == 0 {

                                self.line -= 1;

                            self.column = 0;

                            self.last_offset = prev_newline;

                        None => {

                            self.line = 1;

                            self.column = 1;

                            self.last_offset = 0;

                Ordering::Equal => break,

        match self.last_offset.cmp(&offset) {

            Ordering::Less => {

                self.column += (offset - self.last_offset) as u64;

            Ordering::Greater => {

                self.column -= (self.last_offset - offset) as u64;

            Ordering::Equal => {}

        self.last_offset = offset;

    /// Completes emission of the latest function.

///

    /// The latest function took `func_size` bytes to encode and the current end

    /// of the code section, after the function was appended, is

    /// `code_section_end`.

    pub fn end_func(&mut self, func_size: usize, code_section_end: usize) {

        // Add a final row corresponding to the final `end` instruction in the

        // function to ensure there's something for all bytecodes.

        let row = self.dwarf.unit.line_program.row();

        row.address_offset = (func_size - 1) as u64;

        self.dwarf.unit.line_program.generate_row();

        // This function's symbol is relative to the start of the function

        // itself. Functions are encoded as a leb-size-of-function then the

        // function itself, so to account for the size of the

        // leb-size-of-function we calculate the function start as the current

        // end of the code section minus the size of the function's bytes.

        self.sym_offsets.push(code_section_end - func_size);

        // The line program is relative to the start address, so only the

        // function's size is used here.

        self.dwarf

            .unit

            .line_program

            .end_sequence(u64::try_from(func_size).unwrap());

        // The high PC value here is relative to `DW_AT_low_pc`, so it's the

        // size of the function.

        let entry = self.dwarf.unit.get_mut(self.cur_subprogram.take().unwrap());

        entry.set(

            gimli::DW_AT_high_pc,

            AttributeValue::Data4(func_size as u32),

);

    pub fn set_code_section_size(&mut self, size: usize) {

        let root = self.dwarf.unit.root();

        let entry = self.dwarf.unit.get_mut(root);

        entry.set(gimli::DW_AT_high_pc, AttributeValue::Data4(size as u32));

    pub fn emit(&mut self, dst: &mut Encoder<'_>) {

        let mut sections = Sections::new(DwarfWriter {

            sym_offsets: &self.sym_offsets,

            bytes: Vec::new(),

});

        self.dwarf.write(&mut sections).unwrap();

        sections

            .for_each(|id, writer| {

                if !writer.bytes.is_empty() {

                    dst.wasm.section(&wasm_encoder::CustomSection {

                        name: id.name().into(),

                        data: (&writer.bytes).into(),

});

                Ok::<_, std::convert::Infallible>(())

})

            .unwrap();

    fn i32_ty(&mut self) -> UnitEntryId {

        if self.i32_ty.is_none() {

            self.i32_ty = Some(self.mk_primitive("i32", 4, gimli::DW_ATE_signed));

        self.i32_ty.unwrap()

    fn i64_ty(&mut self) -> UnitEntryId {

        if self.i64_ty.is_none() {

            self.i64_ty = Some(self.mk_primitive("i64", 8, gimli::DW_ATE_signed));

        self.i64_ty.unwrap()

    fn f32_ty(&mut self) -> UnitEntryId {

        if self.f32_ty.is_none() {

            self.f32_ty = Some(self.mk_primitive("f32", 4, gimli::DW_ATE_float));

        self.f32_ty.unwrap()

    fn f64_ty(&mut self) -> UnitEntryId {

        if self.f64_ty.is_none() {

            self.f64_ty = Some(self.mk_primitive("f64", 8, gimli::DW_ATE_float));

        self.f64_ty.unwrap()

    fn mk_primitive(&mut self, name: &str, byte_size: u8, encoding: gimli::DwAte) -> UnitEntryId {

        let name = self.dwarf.strings.add(name);

        let root = self.dwarf.unit.root();

        let ty = self.dwarf.unit.add(root, gimli::DW_TAG_base_type);

        let entry = self.dwarf.unit.get_mut(ty);

        entry.set(gimli::DW_AT_name, AttributeValue::StringRef(name));

        entry.set(gimli::DW_AT_byte_size, AttributeValue::Data1(byte_size));

        entry.set(gimli::DW_AT_encoding, AttributeValue::Encoding(encoding));

ty

/// In-memory `gimli` writer which resolves symbolic addresses through a table
/// of function offsets.
#[derive(Clone)]
struct DwarfWriter<'a> {
    /// Offset for each symbol index, looked up when a symbolic address is
    /// written.
    sym_offsets: &'a [usize],
    /// Bytes written so far.
    bytes: Vec<u8>,
}

impl Writer for DwarfWriter<'_> {

    type Endian = LittleEndian;

    fn endian(&self) -> Self::Endian {

        LittleEndian

    fn len(&self) -> usize {

        self.bytes.len()

    fn write(&mut self, bytes: &[u8]) -> write::Result<()> {

        self.bytes.extend_from_slice(bytes);

        Ok(())

    fn write_at(&mut self, offset: usize, bytes: &[u8]) -> write::Result<()> {

        self.bytes[offset..][..bytes.len()].copy_from_slice(bytes);

        Ok(())

    fn write_address(&mut self, address: Address, size: u8) -> write::Result<()> {

        match address {

            Address::Constant(val) => self.write_udata(val, size),

            Address::Symbol { symbol, addend } => {

                assert_eq!(addend, 0);

                let offset = self.sym_offsets[symbol];

                self.write_udata(offset as u64, size)

#[cfg(test)]
mod tests {
    use super::{Dwarf, EncodeOptions, GenerateDwarf};
    use crate::token::Span;
    use rand::rngs::SmallRng;
    use rand::{Rng, SeedableRng};

    /// Checks that incrementally tracking line/column through randomly
    /// ordered offsets of `contents` agrees with a straightforward
    /// precalculation over the whole string.
    fn linecol_test(contents: &str) {
        let mut dwarf = Dwarf::new(
            0,
            EncodeOptions::default().dwarf("foo.wat".as_ref(), contents, GenerateDwarf::Lines),
            &Default::default(),
            &[],
        )
        .unwrap();

        // Print some debugging information in case someone's debugging this
        // test
        let mut offset = 0;
        for (i, line) in contents.lines().enumerate() {
            println!(
                "line {:2} is at {:2} .. {:2}",
                i + 1,
                offset,
                offset + line.len()
            );
            offset += line.len() + 1;
        }
        println!();

        // Precalculate (line, col) for all characters, assumed to all be one
        // byte here.
        let mut precalculated_linecols = Vec::new();
        let mut line = 1;
        let mut col = 1;
        for c in contents.chars() {
            // A newline counts as column 0 of the following line.
            if c == '\n' {
                line += 1;
                col = 0;
            }
            precalculated_linecols.push((line, col));
            col += 1;
        }

        // Traverse randomly throughout this string and assert that the
        // incremental update matches the precalculated position.
        let mut rand = SmallRng::seed_from_u64(102);
        for _ in 0..1000 {
            let pos = rand.gen_range(0..contents.len());
            dwarf.change_linecol(Span::from_offset(pos));
            let (line, col) = precalculated_linecols[pos];

            assert_eq!(dwarf.line, line, "line mismatch");
            assert_eq!(dwarf.column, col, "column mismatch");
        }
    }

    #[test]
    fn linecol_simple() {
        linecol_test(
            "a

        c (; ... ;)

         fg",
        );
    }

    #[test]
    fn linecol_empty() {
        linecol_test("x");
    }

    #[test]
    fn linecol_start_newline() {
        linecol_test("\nx ab\nyyy \ncc");
    }

    #[test]
    fn linecol_lots_of_newlines() {
        linecol_test("\n\n\n\n");
    }

    #[test]
    fn linecol_interspersed() {
        linecol_test("\na\nb\nc\n");
    }
}