tarcieri · Finomnis · May 31, 2023 · May 31, 2023 · May 31, 2023 · May 31, 2023
diff --git a/examples/benchmark_teensy40/.cargo/config.toml b/examples/benchmark_teensy40/.cargo/config.toml
@@ -0,0 +1,6 @@
+[target.thumbv7em-none-eabihf]
+runner = "python run.py"
+rustflags = ["-C", "link-arg=-Tt4link.x"]
+
+[build]
+target = "thumbv7em-none-eabihf" # Teensy 4
diff --git a/examples/benchmark_teensy40/.gitignore b/examples/benchmark_teensy40/.gitignore
@@ -0,0 +1 @@
+/target/
diff --git a/examples/benchmark_teensy40/.vscode/settings.json b/examples/benchmark_teensy40/.vscode/settings.json
@@ -0,0 +1,13 @@
+{
+    "rust-analyzer.cargo.target": "thumbv7em-none-eabihf",
+    // override the default setting (`cargo check --all-targets`) which produces the following error
+    // "can't find crate for `test`" when the default compilation target is a no_std target
+    // with these changes RA will call `cargo check --bins` on save
+    "rust-analyzer.checkOnSave.allTargets": false,
+    "rust-analyzer.checkOnSave.extraArgs": [
+        "--bins"
+    ],
+    "rust-analyzer.linkedProjects": [
+        "Cargo.toml",
+    ],
+}
diff --git a/examples/benchmark_teensy40/Cargo.toml b/examples/benchmark_teensy40/Cargo.toml
@@ -0,0 +1,43 @@
+[package]
+authors = ["Finomnis <[email protected]>"]
+name = "micromath_benchmark_teensy40"
+edition = "2021"
+version = "0.1.0"
+
+[dependencies]
+# Base dependencies
+cortex-m = "0.7.7"
+embedded-hal = "0.2.7"
+teensy4-panic = "0.2.2"
+
+# Board support package
+teensy4-bsp = { version = "0.4.2", features = ["rt"] }
+
+# Packages necessary for board usage
+nb = "1.1.0"    # Async
+fugit = "0.3.6" # Time
+
+# Project dependencies
+git-version = "0.3.5"
+micromath = { path = "../../" }
+libm = "0.2.7"
+
+
+# cargo build/run
+[profile.dev]
+codegen-units = 1
+debug = 2
+debug-assertions = true # <-
+incremental = false
+opt-level = 'z'         # <-
+overflow-checks = true  # <-
+
+# cargo build/run --release
+[profile.release]
+codegen-units = 1
+debug = 2
+debug-assertions = false # <-
+incremental = false
+lto = 'fat'
+opt-level = 3            # <-
+overflow-checks = false  # <-
diff --git a/examples/benchmark_teensy40/README.md b/examples/benchmark_teensy40/README.md
@@ -0,0 +1,53 @@
+# Micromath Benchmark Suite
+
+*- intended for the [Teensy 4.0](https://www.pjrc.com/store/teensy40.html) board -*
+
+## Prerequisites
+
+The following hardware is required for the benchmark:
+- A [Teensy 4.0](https://www.pjrc.com/store/teensy40.html) development board
+- A way to read the Teensy's UART, like a USB-UART-converter
+
+The following software tools have to be installed:
+- Python3 (as `python`, or modify the `runner` in `.cargo/config.toml` to use the `python3` binary)
+- [`llvm-objcopy`](https://github.com/rust-lang/rust/issues/85658)
+  - Install via `rustup component add llvm-tools-preview`
+- [`teensy_loader_cli`](https://www.pjrc.com/teensy/loader_cli.html)
+
+
+## Run
+
+- Connect the Teensy to PC via USB cable.
+- Run `cargo run --release`.
+- Read the output of the benchmark on the Teensy's UART.
+
+## Results
+
+```none
+===== Micromath Benchmark =====
+Git Version: deb8988
+
+All values in ns/iter.
+
+            micromath       libm intrinsics
+abs              15.0       15.0       10.0
+acos            173.3      120.0
+asin            103.3      136.7
+atan             80.0       78.3
+atan_norm        75.0
+ceil             43.3       41.7
+cos              98.3     1655.0
+exp             231.7      115.0
+floor            36.7       38.3
+fract            20.0
+inv              15.0
+invsqrt          15.0
+ln              158.3      145.0
+log2            161.7      155.0
+log10           161.7      160.0
+round            30.0
+sin             103.3     1703.3
+sqrt             46.7      470.0       31.7
+tan             153.3     2531.3
+trunc            21.7       43.3
+```
diff --git a/examples/benchmark_teensy40/run.py b/examples/benchmark_teensy40/run.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python
+
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import subprocess
+import sys
+
+
+def main():
+    """Convert the binary given as first argument to Intel HEX and flash it to a Teensy 4.0."""
+    if len(sys.argv) < 2:
+        print("Please provide the binary as first argument!", file=sys.stderr)
+        sys.exit(1)
+
+    binary = sys.argv[1]
+    with TemporaryDirectory() as tmpdir:
+        tmpdir = Path(tmpdir)
+        hexfile = tmpdir / "firmware.hex"
+
+        subprocess.run(["llvm-objcopy", "-O", "ihex", binary, hexfile], check=True)
+        subprocess.run(["teensy_loader_cli", "--mcu=TEENSY40", "-wsv", hexfile], check=True)
+
+    print("Teensy40 successfully flashed. Read the results of the benchmark from its UART2.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/benchmark_teensy40/src/main.rs b/examples/benchmark_teensy40/src/main.rs
@@ -0,0 +1,102 @@
+//! Benchmarks `micromath` against `libm` and the compiler intrinsics.
+//!
+//! Results are written to the Teensy 4.0's LPUART2, with the following pinout:
+//!
+//! - Pin 14 is TX.
+//! - Pin 15 is RX.
+//!
+//! Baud rate is 115200bps.
+//!
+//! Once all benchmarks have finished, the LED blinks with a cycle length
+//! of 2 seconds.
+
+#![no_std]
+#![no_main]
+#![feature(core_intrinsics)]
+
+use teensy4_bsp as bsp;
+use teensy4_panic as _;
+
+use bsp::board;
+use bsp::hal;
+
+mod uart_writer;
+use uart_writer::UartWriter;
+
+mod run_bench;
+use run_bench::run_bench;
+
+#[bsp::rt::entry]
+fn main() -> ! {
+    let board::Resources {
+        pins,
+        mut gpio2,
+        lpuart2,
+        gpt1: mut us_timer,
+        ..
+    } = board::t40(board::instances());
+
+    // Initialize LED
+    let led = board::led(&mut gpio2, pins.p13);
+    led.set();
+
+    // Initialize UART
+    let mut uart = UartWriter::new(board::lpuart(lpuart2, pins.p14, pins.p15, 115200));
+    writeln!(uart);
+
+    // Initialize timer
+    // Is a 32-bit timer with us precision.
+    // Overflows every 71.58 minutes, which is sufficient for our benchmark.
+    assert_eq!(board::PERCLK_FREQUENCY, 1_000_000); // 1 MHz with divider 1 => one tick per us
+    us_timer.set_clock_source(hal::gpt::ClockSource::PeripheralClock);
+    us_timer.set_divider(1);
+    us_timer.set_mode(hal::gpt::Mode::FreeRunning);
+    us_timer.enable();
+    let time_us = move || us_timer.count(); // current timer value, in us
+
+    // Write welcome message
+    writeln!(uart, "===== Micromath Benchmark =====");
+    writeln!(uart, "Git Version: {}", git_version::git_version!());
+    writeln!(uart);
+
+    writeln!(uart, "All values in ns/iter.");
+    writeln!(uart);
+    write!(uart, "          "); // empty header cell above the 10-wide function name column
+    write!(uart, "  micromath");
+    write!(uart, "       libm");
+    write!(uart, " intrinsics");
+    writeln!(uart);
+
+    // Run benchmarks
+    // Arguments: timer, output, micromath function, [libm function, [intrinsic]]
+    run_bench!(time_us, uart, abs, fabsf, fabsf32);
+    run_bench!(time_us, uart, acos, acosf);
+    run_bench!(time_us, uart, asin, asinf);
+    run_bench!(time_us, uart, atan, atanf);
+    run_bench!(time_us, uart, atan_norm);
+    run_bench!(time_us, uart, ceil, ceilf);
+    run_bench!(time_us, uart, cos, cosf);
+    run_bench!(time_us, uart, exp, expf);
+    run_bench!(time_us, uart, floor, floorf);
+    run_bench!(time_us, uart, fract);
+    run_bench!(time_us, uart, inv);
+    run_bench!(time_us, uart, invsqrt);
+    run_bench!(time_us, uart, ln, logf);
+    run_bench!(time_us, uart, log2, log2f);
+    run_bench!(time_us, uart, log10, log10f);
+    run_bench!(time_us, uart, round);
+    run_bench!(time_us, uart, sin, sinf);
+    run_bench!(time_us, uart, sqrt, sqrtf, sqrtf32);
+    run_bench!(time_us, uart, tan, tanf);
+    run_bench!(time_us, uart, trunc, truncf);
+
+    // Blink with a cycle length of 2 seconds, to make it verifiable that
+    // our timer runs at the correct speed.
+    loop {
+        let time_s = time_us() / 1_000_000;
+        if time_s % 2 == 0 {
+            led.set();
+        } else {
+            led.clear();
+        }
+    }
+}
diff --git a/examples/benchmark_teensy40/src/run_bench.rs b/examples/benchmark_teensy40/src/run_bench.rs
@@ -0,0 +1,117 @@
+macro_rules! run_bench {
+    ($time_us:expr, $uart:expr, $f:ident) => {{
+        write!($uart, "{:<10}", core::stringify!($f));
+        // Only the micromath implementation is benchmarked.
+        run_bench!(@run_bench $time_us, $uart, |num| <f32 as micromath::F32Ext>::$f(num));
+
+        writeln!($uart);
+    }};
+    // Row with a micromath and a libm column.
+    ($time_us:expr, $uart:expr, $f:ident, $flibm:ident) => {{
+        write!($uart, "{:<10}", core::stringify!($f));
+
+        run_bench!(@run_bench $time_us, $uart, |num| <f32 as micromath::F32Ext>::$f(num));
+        run_bench!(@run_bench $time_us, $uart, |num| libm::$flibm(num));
+
+        writeln!($uart);
+    }};
+    // Row with micromath, libm and compiler intrinsic columns.
+    ($time_us:expr, $uart:expr, $f:ident, $flibm:ident, $fintr:ident) => {{
+        write!($uart, "{:<10}", core::stringify!($f));
+        // SAFETY (assumed): the float intrinsics used here accept any `f32` - TODO confirm.
+        run_bench!(@run_bench $time_us, $uart, |num| <f32 as micromath::F32Ext>::$f(num));
+        run_bench!(@run_bench $time_us, $uart, |num| libm::$flibm(num));
+        run_bench!(@run_bench $time_us, $uart, |num| unsafe{ core::intrinsics::$fintr(num) });
+
+        writeln!($uart);
+    }};
+    // Times `$f` and prints one ns/iter column, corrected by the cost of an empty loop.
+    (@run_bench $time_us:expr, $uart:expr, $f:expr) => {{
+        const COUNT: u64 = 4096 * 64;
+        // Input value fed to every benchmarked function.
+        const VALUE: f32 = 0.12345;
+
+        if !$f(VALUE).is_finite() {
+            writeln!($uart, "  ERROR: {} did not produce a finite value!", core::stringify!($f));
+        }
+
+        // Warmup
+        for _ in 0..COUNT {
+            //run_bench!(@unroll_32 {
+                run_bench!(@iteration VALUE, $f);
+            //})
+        }
+
+        let t_start = $time_us();
+        for _ in 0..COUNT {
+            //run_bench!(@unroll_32 {
+                run_bench!(@iteration VALUE, $f);
+            //})
+        }
+        let t_end = $time_us();
+
+        // Warmup
+        for _ in 0..COUNT {
+            //run_bench!(@unroll_32 {
+                run_bench!(@iteration VALUE, |val| val);
+            //})
+        }
+
+        let t_empty_start = $time_us();
+        for _ in 0..COUNT {
+            //run_bench!(@unroll_32 {
+                run_bench!(@iteration VALUE, |val| val);
+            //})
+        }
+        let t_empty_end = $time_us();
+
+        let iterations: u64 = COUNT; // * 32;
+        // `wrapping_sub` keeps the difference correct if the 32-bit timer rolls over.
+        let duration_us = u64::from(t_end.wrapping_sub(t_start));
+        let duration_us_empty = u64::from(t_empty_end.wrapping_sub(t_empty_start));
+        // Subtract the loop overhead; saturate so a slower empty loop cannot underflow.
+        let duration_ps = duration_us.saturating_sub(duration_us_empty) * 1_000_000;
+        let ps_per_iter = duration_ps / iterations;
+        // Add half of the last printed digit (0.1 ns = 100 ps) to round to nearest.
+        let rounding_corrected_ps_per_iter = ps_per_iter + 50;
+        let ns_per_iter_full = rounding_corrected_ps_per_iter / 1000;
+        let ns_per_iter_rest = (rounding_corrected_ps_per_iter % 1000) / 100;
+
+        // writeln!($uart, "    {} iterations in {} us ({} us empty)", iterations, duration_us, duration_us_empty);
+        // writeln!($uart, "    {} ps/iter", ps_per_iter);
+        write!($uart, "{:>9}.{}", ns_per_iter_full, ns_per_iter_rest);
+    }};
+
+    // (@unroll_64 $b:block) => {{
+    //     run_bench!(@unroll_32 $b);
+    //     run_bench!(@unroll_32 $b);
+    // }};
+    // (@unroll_32 $b:block) => {{
+    //     run_bench!(@unroll_16 $b);
+    //     run_bench!(@unroll_16 $b);
+    // }};
+    // (@unroll_16 $b:block) => {{
+    //     run_bench!(@unroll_8 $b);
+    //     run_bench!(@unroll_8 $b);
+    // }};
+    // (@unroll_8 $b:block) => {{
+    //     run_bench!(@unroll_4 $b);
+    //     run_bench!(@unroll_4 $b);
+    // }};
+    // (@unroll_4 $b:block) => {{
+    //     run_bench!(@unroll_2 $b);
+    //     run_bench!(@unroll_2 $b);
+    // }};
+    // (@unroll_2 $b:block) => {{
+    //     {$b}{$b}
+    // }};
+    // One call of `$f`; input and result go through `black_box` to hide them from the optimizer.
+    (@iteration $val:expr, $f:expr) => {{
+        const NUM_CONST: f32 = $val;
+        let num = core::intrinsics::black_box(NUM_CONST);
+        let result = $f(num);
+        core::intrinsics::black_box(result);
+    }};
+}
+
+pub(crate) use run_bench;