Revision control

Copy as Markdown

Other Tools

//! Implementation of emitting DWARF debugging information for `*.wat` files.
//!
//! This is intended to be relatively simple but the goal is to enable emission
//! of DWARF sections which point back to the original `*.wat` file itself. This
//! enables debuggers like LLDB to debug `*.wat` files without necessarily
//! built-in knowledge of WebAssembly itself.
//!
//! Overall I was curious one weekend and decided to implement this. It's an
//! off-by-default crate feature and an off-by-default runtime feature of this
//! crate. Hopefully it doesn't carry too much complexity with it while still
//! being easy/fun to play around with.
use crate::core::binary::{EncodeOptions, Encoder, GenerateDwarf, Names, RecOrType};
use crate::core::{InnerTypeKind, Local, ValType};
use crate::token::Span;
use gimli::write::{
self, Address, AttributeValue, DwarfUnit, Expression, FileId, LineProgram, LineString,
Sections, UnitEntryId, Writer,
};
use gimli::{Encoding, Format, LineEncoding, LittleEndian};
use std::cmp::Ordering;
use std::collections::HashMap;
use std::path::Path;
/// In-progress DWARF debugging information for a single `*.wat` module.
///
/// A value is created with `Dwarf::new`, fed functions through
/// `start_func` / `instr` / `end_func`, and finally serialized into custom
/// sections with `emit`.
pub struct Dwarf<'a> {
// Metadata configured at `Dwarf` creation time
//
// Function names keyed by function index, copied from the `name` section
// data handed to `new`.
func_names: HashMap<u32, &'a str>,
// Number of imported functions, i.e. the index of the first defined function.
func_imports: u32,
// Text of the original `*.wat` file; scanned for newlines to compute
// line/column information.
contents: &'a str,
// Line-program file entry registered for the `*.wat` file in `new`.
file_id: FileId,
// How much DWARF to generate; `GenerateDwarf::Full` additionally emits
// parameters and locals for each function.
style: GenerateDwarf,
// Types of the module, searched to find each function's parameter types.
types: &'a [RecOrType<'a>],
// The `gimli` compilation unit (plus string tables) being built.
dwarf: DwarfUnit,
// Next function index when `start_func` is called
next_func: u32,
// Current `DW_TAG_subprogram` that's being built as part of `start_func`.
cur_subprogram: Option<UnitEntryId>,
// Number of `instr` calls seen so far for the current function; zero means
// the next instruction is flagged as the end of the prologue.
cur_subprogram_instrs: usize,
// Code-section-relative offset of the start of every function. Filled in
// as part of `end_func`.
sym_offsets: Vec<usize>,
// Metadata tracking what the line/column information was at the specified
// last offset.
line: u64,
column: u64,
last_offset: usize,
// Lazily-created `DW_TAG_base_type` entries for the numeric wasm types;
// `None` until the corresponding type is first needed.
i32_ty: Option<UnitEntryId>,
i64_ty: Option<UnitEntryId>,
f32_ty: Option<UnitEntryId>,
f64_ty: Option<UnitEntryId>,
}
impl<'a> Dwarf<'a> {
/// Creates a new `Dwarf` from the specified configuration.
///
/// Returns `None` when DWARF emission was not requested in `opts`, or when
/// the configured file path is not valid UTF-8.
///
/// * `func_imports` - the number of imported functions in this module, or
///   the index of the first defined function.
/// * `opts` - encoding options, which is where DWARF emission is enabled
///   and configured.
/// * `names` - the `name` custom section for this module, used to name
///   functions in DWARF.
/// * `types` - the types of this module, used to look up function
///   signatures when describing parameters.
pub fn new(
    func_imports: u32,
    opts: &EncodeOptions<'a>,
    names: &Names<'a>,
    types: &'a [RecOrType<'a>],
) -> Option<Dwarf<'a>> {
    // This `?` is load-bearing: it skips all DWARF machinery entirely when
    // the feature was not enabled at runtime.
    let (path, contents, style) = opts.dwarf_info?;
    let path = path.to_str()?;

    // Initial `gimli::write` context.
    let encoding = Encoding {
        format: Format::Dwarf32,
        version: 5,
        address_size: 4,
    };
    let mut dwarf = DwarfUnit::new(encoding);

    // Split the path into a compilation directory and a file name, falling
    // back to "." plus the whole path if either piece is unavailable.
    let split = Path::new(path);
    let dir = split
        .parent()
        .and_then(|p| p.to_str())
        .filter(|p| !p.is_empty());
    let base = split.file_name().and_then(|f| f.to_str());
    let (comp_dir, comp_file) = dir.zip(base).unwrap_or((".", path));

    let comp_dir_ref = dwarf.strings.add(comp_dir);
    let comp_file_ref = dwarf.strings.add(comp_file);
    dwarf.unit.line_program = LineProgram::new(
        encoding,
        LineEncoding::default(),
        LineString::StringRef(comp_dir_ref),
        LineString::StringRef(comp_file_ref),
        None,
    );
    let line_program = &mut dwarf.unit.line_program;
    let dir_id = line_program.default_directory();
    let file_id = line_program.add_file(LineString::StringRef(comp_file_ref), dir_id, None);

    // A single compilation unit covers the entire code section. The size of
    // the code section isn't known yet, so only a "low pc" is set here.
    let producer = format!("wast {}", env!("CARGO_PKG_VERSION")).into_bytes();
    let cu_attrs = [
        (gimli::DW_AT_producer, AttributeValue::String(producer)),
        // Technically this should be something like wasm or wat but that
        // doesn't exist so fill in something here.
        (
            gimli::DW_AT_language,
            AttributeValue::Language(gimli::DW_LANG_C),
        ),
        (gimli::DW_AT_name, AttributeValue::StringRef(comp_file_ref)),
        (
            gimli::DW_AT_comp_dir,
            AttributeValue::StringRef(comp_dir_ref),
        ),
        (gimli::DW_AT_low_pc, AttributeValue::Data4(0)),
    ];
    let root = dwarf.unit.root();
    let cu = dwarf.unit.get_mut(root);
    for (attr, value) in cu_attrs {
        cu.set(attr, value);
    }

    // Lookup table from function index to its name.
    let func_names: HashMap<u32, &'a str> = names
        .funcs
        .iter()
        .map(|(idx, name)| (*idx, *name))
        .collect();

    // Offsets pointing to newlines are considered internally as the 0th
    // column of the next line, so handle contents starting with a newline.
    let (line, column) = match contents.as_bytes().first() {
        Some(b'\n') => (2, 0),
        _ => (1, 1),
    };

    Some(Dwarf {
        func_names,
        func_imports,
        contents,
        file_id,
        style,
        types,
        dwarf,
        next_func: func_imports,
        cur_subprogram: None,
        cur_subprogram_instrs: 0,
        sym_offsets: Vec::new(),
        line,
        column,
        last_offset: 0,
        i32_ty: None,
        i64_ty: None,
        f32_ty: None,
        f64_ty: None,
    })
}
/// Begins emission of the next defined function, whose definition is
/// located at `span`.
///
/// This opens a new line-program sequence for the function and creates a
/// `DW_TAG_subprogram` entry for it. `ty` is the function's type index and
/// `locals` are its declared locals; both are only consulted when full
/// DWARF generation was requested.
pub fn start_func(&mut self, span: Span, ty: u32, locals: &[Local<'_>]) {
    self.change_linecol(span);

    // Addresses are symbolic for now: the symbol is the index of this
    // function among the defined (non-imported) functions.
    let func_index = self.next_func;
    let addr = Address::Symbol {
        symbol: (func_index - self.func_imports) as usize,
        addend: 0,
    };
    self.dwarf.unit.line_program.begin_sequence(Some(addr));

    // Use the name from the name section if there is one, otherwise
    // synthesize one from the function index.
    let name: Vec<u8> = match self.func_names.get(&func_index) {
        Some(name) => name.as_bytes().to_vec(),
        None => format!("wasm-function[{func_index}]").into_bytes(),
    };

    let attrs = [
        (gimli::DW_AT_name, AttributeValue::String(name)),
        (
            gimli::DW_AT_decl_file,
            AttributeValue::FileIndex(Some(self.file_id)),
        ),
        (gimli::DW_AT_decl_line, AttributeValue::Udata(self.line)),
        (gimli::DW_AT_decl_column, AttributeValue::Udata(self.column)),
        (gimli::DW_AT_external, AttributeValue::FlagPresent),
        (gimli::DW_AT_low_pc, AttributeValue::Address(addr)),
    ];
    let root = self.dwarf.unit.root();
    let subprogram = self.dwarf.unit.add(root, gimli::DW_TAG_subprogram);
    let entry = self.dwarf.unit.get_mut(subprogram);
    for (attr, value) in attrs {
        entry.set(attr, value);
    }

    if matches!(self.style, GenerateDwarf::Full) {
        self.add_func_params_and_locals(subprogram, ty, locals);
    }

    self.cur_subprogram = Some(subprogram);
    self.cur_subprogram_instrs = 0;
    self.next_func = func_index + 1;
}
/// Adds a `DW_TAG_formal_parameter` for every parameter and a
/// `DW_TAG_variable` for every declared local of a function.
///
/// Parameters and locals share one index space: parameters come first,
/// followed by `locals`. The location expression of each is constant
/// ("it's that wasm local") and is valid for the entire function.
///
/// Nothing is added if `ty` does not refer to a function type.
fn add_func_params_and_locals(
    &mut self,
    subprogram: UnitEntryId,
    ty: u32,
    locals: &[Local<'_>],
) {
    // Walk `self.types`, which is what was encoded into the module, with
    // recursion groups flattened, to find the type at index `ty`.
    let def = self
        .types
        .iter()
        .flat_map(|entry| match entry {
            RecOrType::Type(single) => std::slice::from_ref(*single),
            RecOrType::Rec(group) => &group.types,
        })
        .nth(ty as usize);
    let func_ty = match def {
        Some(def) => match &def.def.kind {
            InnerTypeKind::Func(func_ty) => func_ty,
            _ => return,
        },
        None => return,
    };

    // One running counter hands out local indices to parameters first and
    // declared locals second.
    let mut indices = 0u32..;
    for ((_, _, param_ty), index) in func_ty.params.iter().zip(indices.by_ref()) {
        self.local(index, subprogram, gimli::DW_TAG_formal_parameter, param_ty);
    }
    for (local, index) in locals.iter().zip(indices.by_ref()) {
        self.local(index, subprogram, gimli::DW_TAG_variable, &local.ty);
    }
}
/// Tries to describe wasm local number `local`, of type `ty`, as a child
/// entry with the given `tag` underneath `subprogram`.
///
/// Nothing is emitted when `ty` has no DWARF representation.
fn local(&mut self, local: u32, subprogram: UnitEntryId, tag: gimli::DwTag, ty: &ValType<'_>) {
    if let Some(dwarf_ty) = self.val_type_to_dwarf_type(ty) {
        // The value lives in the wasm local itself.
        let mut location = Expression::new();
        location.op_wasm_local(local);
        location.op(gimli::DW_OP_stack_value);

        let var = self.dwarf.unit.add(subprogram, tag);
        let entry = self.dwarf.unit.get_mut(var);
        entry.set(
            gimli::DW_AT_name,
            AttributeValue::String(format!("local{local}").into_bytes()),
        );
        entry.set(gimli::DW_AT_location, AttributeValue::Exprloc(location));
        entry.set(gimli::DW_AT_type, AttributeValue::UnitRef(dwarf_ty));
    }
}
/// Maps a wasm value type to the DWARF base type entry describing it,
/// creating that entry on first use.
///
/// Returns `None` for types that currently have no DWARF representation.
fn val_type_to_dwarf_type(&mut self, ty: &ValType<'_>) -> Option<UnitEntryId> {
    let id = match ty {
        ValType::I32 => self.i32_ty(),
        ValType::I64 => self.i64_ty(),
        ValType::F32 => self.f32_ty(),
        ValType::F64 => self.f64_ty(),

        // TODO: make a C union of sorts or something like that to
        // represent v128 as an array-of-lanes or u128 or something like
        // that.
        ValType::V128 => return None,

        // Not much that can be done about reference types without actually
        // knowing what the engine does, this probably needs an addition to
        // DWARF itself to represent this.
        ValType::Ref(_) => return None,
    };
    Some(id)
}
/// Records an instruction which starts at `offset` and is defined at
/// `span`.
///
/// Note that `offset` is code-section-relative. A new line-program row is
/// only generated when something about the row actually changed.
pub fn instr(&mut self, offset: usize, span: Span) {
    // Writes `val` into `slot` and reports whether that changed the
    // stored value.
    fn update<T: PartialEq>(slot: &mut T, val: T) -> bool {
        if *slot == val {
            return false;
        }
        *slot = val;
        true
    }

    self.change_linecol(span);
    let offset = u64::try_from(offset).unwrap();
    // The first instruction of a function marks the end of its prologue.
    let first_instr = self.cur_subprogram_instrs == 0;

    let row = self.dwarf.unit.line_program.row();
    let mut changed = update(&mut row.address_offset, offset);
    changed |= update(&mut row.line, self.line);
    changed |= update(&mut row.column, self.column);
    changed |= update(&mut row.file, self.file_id);
    changed |= update(&mut row.is_statement, true);
    changed |= update(&mut row.prologue_end, first_instr);

    if changed {
        self.dwarf.unit.line_program.generate_row();
    }
    self.cur_subprogram_instrs += 1;
}
/// Change `self.line` and `self.column` to be appropriate for the offset
/// in `span`.
///
/// This will incrementally move from `self.last_offset` to `span.offset()`
/// and update line/column information as we go. It's assumed that this is
/// more efficient than a precalculate-all-the-positions-for-each-byte
/// approach since that would require a great deal of memory to store a
/// line/col for all bytes in the input string. It's also assumed that most
/// `span` adjustments are minor as it's between instructions in a function
/// which are frequently close together. Whether or not this assumption
/// pans out is yet to be seen.
fn change_linecol(&mut self, span: Span) {
let offset = span.offset();
loop {
match self.last_offset.cmp(&offset) {
Ordering::Less => {
let haystack = self.contents[self.last_offset + 1..].as_bytes();
let next_newline = match memchr::memchr_iter(b'\n', haystack).next() {
Some(pos) => pos + self.last_offset + 1,
None => break,
};
if next_newline > offset {
break;
} else {
self.line += 1;
self.column = 0;
self.last_offset = next_newline;
}
}
Ordering::Greater => {
let haystack = self.contents[..self.last_offset].as_bytes();
match memchr::memchr_iter(b'\n', haystack).next_back() {
Some(prev_newline) => {
if self.column == 0 {
self.line -= 1;
}
self.column = 0;
self.last_offset = prev_newline;
}
None => {
self.line = 1;
self.column = 1;
self.last_offset = 0;
}
}
}
Ordering::Equal => break,
}
}
match self.last_offset.cmp(&offset) {
Ordering::Less => {
self.column += (offset - self.last_offset) as u64;
}
Ordering::Greater => {
self.column -= (self.last_offset - offset) as u64;
}
Ordering::Equal => {}
}
self.last_offset = offset;
}
/// Completes emission of the latest function.
///
/// The latest function took `func_size` bytes to encode and the current end
/// of the code section, after the function was appended, is
/// `code_section_end`.
///
/// # Panics
///
/// Panics if there is no function in progress (no preceding `start_func`)
/// or if `func_size` does not fit in the 32 bits used for `DW_AT_high_pc`.
pub fn end_func(&mut self, func_size: usize, code_section_end: usize) {
    // Convert once, with checks, instead of using truncating `as` casts.
    let size_u64 = u64::try_from(func_size).unwrap();
    let size_u32 =
        u32::try_from(func_size).expect("function size must fit in a 32-bit DW_AT_high_pc");

    // Add a final row corresponding to the final `end` instruction in the
    // function to ensure there's something for all bytecodes.
    let row = self.dwarf.unit.line_program.row();
    row.address_offset = size_u64 - 1;
    self.dwarf.unit.line_program.generate_row();

    // This function's symbol is relative to the start of the function
    // itself. Functions are encoded as a leb-size-of-function then the
    // function itself, so to account for the size of the
    // leb-size-of-function we calculate the function start as the current
    // end of the code section minus the size of the function's bytes.
    self.sym_offsets.push(code_section_end - func_size);

    // The line program is relative to the start address, so only the
    // function's size is used here.
    self.dwarf.unit.line_program.end_sequence(size_u64);

    // The high PC value here is relative to `DW_AT_low_pc`, so it's the
    // size of the function.
    let subprogram = self
        .cur_subprogram
        .take()
        .expect("`end_func` called without a matching `start_func`");
    let entry = self.dwarf.unit.get_mut(subprogram);
    entry.set(gimli::DW_AT_high_pc, AttributeValue::Data4(size_u32));
}
/// Records the final size of the code section, in bytes, as the
/// `DW_AT_high_pc` of the compilation unit.
///
/// # Panics
///
/// Panics if `size` does not fit in the 32 bits used for the attribute,
/// rather than silently truncating it.
pub fn set_code_section_size(&mut self, size: usize) {
    let size = u32::try_from(size).expect("code section size must fit in a 32-bit DW_AT_high_pc");
    let root = self.dwarf.unit.root();
    let entry = self.dwarf.unit.get_mut(root);
    entry.set(gimli::DW_AT_high_pc, AttributeValue::Data4(size));
}
/// Serializes all accumulated DWARF and appends every non-empty DWARF
/// section to `dst` as a custom section named after the section.
pub fn emit(&mut self, dst: &mut Encoder<'_>) {
    let writer = DwarfWriter {
        sym_offsets: &self.sym_offsets,
        bytes: Vec::new(),
    };
    let mut sections = Sections::new(writer);
    self.dwarf.write(&mut sections).unwrap();

    let result: Result<(), std::convert::Infallible> = sections.for_each(|id, section| {
        // Sections that ended up with no contents are skipped entirely.
        if !section.bytes.is_empty() {
            dst.custom_section(id.name(), &section.bytes);
        }
        Ok(())
    });
    result.unwrap();
}
/// Returns the base type entry for `i32`, creating it on first use.
fn i32_ty(&mut self) -> UnitEntryId {
    match self.i32_ty {
        Some(id) => id,
        None => {
            let id = self.mk_primitive("i32", 4, gimli::DW_ATE_signed);
            self.i32_ty = Some(id);
            id
        }
    }
}
/// Returns the base type entry for `i64`, creating it on first use.
fn i64_ty(&mut self) -> UnitEntryId {
    match self.i64_ty {
        Some(id) => id,
        None => {
            let id = self.mk_primitive("i64", 8, gimli::DW_ATE_signed);
            self.i64_ty = Some(id);
            id
        }
    }
}
/// Returns the base type entry for `f32`, creating it on first use.
fn f32_ty(&mut self) -> UnitEntryId {
    match self.f32_ty {
        Some(id) => id,
        None => {
            let id = self.mk_primitive("f32", 4, gimli::DW_ATE_float);
            self.f32_ty = Some(id);
            id
        }
    }
}
/// Returns the base type entry for `f64`, creating it on first use.
fn f64_ty(&mut self) -> UnitEntryId {
    match self.f64_ty {
        Some(id) => id,
        None => {
            let id = self.mk_primitive("f64", 8, gimli::DW_ATE_float);
            self.f64_ty = Some(id);
            id
        }
    }
}
/// Adds a `DW_TAG_base_type` entry under the unit root with the given
/// `name`, `byte_size` and `encoding`, returning the new entry's id.
fn mk_primitive(&mut self, name: &str, byte_size: u8, encoding: gimli::DwAte) -> UnitEntryId {
    let name_ref = self.dwarf.strings.add(name);
    let unit = &mut self.dwarf.unit;
    let root = unit.root();
    let base_ty = unit.add(root, gimli::DW_TAG_base_type);

    let entry = unit.get_mut(base_ty);
    entry.set(gimli::DW_AT_name, AttributeValue::StringRef(name_ref));
    entry.set(gimli::DW_AT_byte_size, AttributeValue::Data1(byte_size));
    entry.set(gimli::DW_AT_encoding, AttributeValue::Encoding(encoding));
    base_ty
}
}
/// In-memory byte sink implementing `gimli::write::Writer`, which resolves
/// symbolic function addresses while writing.
#[derive(Clone)]
struct DwarfWriter<'a> {
// Offset of the start of each defined function, indexed by the `symbol`
// of an `Address::Symbol`.
sym_offsets: &'a [usize],
// Bytes written so far.
bytes: Vec<u8>,
}
impl Writer for DwarfWriter<'_> {
    type Endian = LittleEndian;

    /// Output is always little-endian.
    fn endian(&self) -> Self::Endian {
        LittleEndian
    }

    /// Number of bytes written so far.
    fn len(&self) -> usize {
        self.bytes.len()
    }

    /// Appends `bytes` to the end of the buffer.
    fn write(&mut self, bytes: &[u8]) -> write::Result<()> {
        self.bytes.extend_from_slice(bytes);
        Ok(())
    }

    /// Overwrites already-written bytes starting at `offset`.
    ///
    /// Panics if the target range is not entirely within the buffer.
    fn write_at(&mut self, offset: usize, bytes: &[u8]) -> write::Result<()> {
        let tail = &mut self.bytes[offset..];
        tail[..bytes.len()].copy_from_slice(bytes);
        Ok(())
    }

    /// Writes `address` as a `size`-byte value.
    ///
    /// Symbolic addresses are resolved through `sym_offsets`; only a zero
    /// addend is supported.
    fn write_address(&mut self, address: Address, size: u8) -> write::Result<()> {
        let value = match address {
            Address::Constant(val) => val,
            Address::Symbol { symbol, addend } => {
                assert_eq!(addend, 0);
                self.sym_offsets[symbol] as u64
            }
        };
        self.write_udata(value, size)
    }
}
// Tests for the incremental line/column tracking in `Dwarf::change_linecol`.
#[cfg(test)]
mod tests {
use super::{Dwarf, EncodeOptions, GenerateDwarf};
use crate::token::Span;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
// Checks that jumping to random offsets within `contents` always yields the
// same (line, column) as a straightforward precalculation over the whole
// string.
fn linecol_test(contents: &str) {
let mut dwarf = Dwarf::new(
0,
EncodeOptions::default().dwarf("foo.wat".as_ref(), contents, GenerateDwarf::Lines),
&Default::default(),
&[],
)
.unwrap();
// Print some debugging information in case someone's debugging this
// test
let mut offset = 0;
for (i, line) in contents.lines().enumerate() {
println!(
"line {:2} is at {:2} .. {:2}",
i + 1,
offset,
offset + line.len()
);
offset += line.len() + 1;
}
println!("");
// Precalculate (line, col) for all characters, assumed to all be one
// byte here.
let mut precalculated_linecols = Vec::new();
let mut line = 1;
let mut col = 1;
for c in contents.chars() {
// A newline itself is recorded as column 0 of the following line.
if c == '\n' {
line += 1;
col = 0;
}
precalculated_linecols.push((line, col));
col += 1;
}
// Traverse randomly throughout this string and assert that the
// incremental update matches the precalculated position.
//
// The RNG seed is fixed so the sequence of positions is reproducible.
let mut rand = SmallRng::seed_from_u64(102);
for _ in 0..1000 {
let pos = rand.gen_range(0..contents.len());
dwarf.change_linecol(Span::from_offset(pos));
let (line, col) = precalculated_linecols[pos];
assert_eq!(dwarf.line, line, "line mismatch");
assert_eq!(dwarf.column, col, "column mismatch");
}
}
// Several lines of varying length, with no trailing newline.
#[test]
fn linecol_simple() {
linecol_test(
"a
b
c (; ... ;)
d
e
f
fg",
);
}
// A single character with no newlines at all.
#[test]
fn linecol_empty() {
linecol_test("x");
}
// Contents that begin with a newline.
#[test]
fn linecol_start_newline() {
linecol_test("\nx ab\nyyy \ncc");
}
// Contents made up of nothing but newlines.
#[test]
fn linecol_lots_of_newlines() {
linecol_test("\n\n\n\n");
}
// Single characters separated by newlines, with a leading and trailing
// newline.
#[test]
fn linecol_interspersed() {
linecol_test("\na\nb\nc\n");
}
}