[BBAI-64] Shared memory between R5F and Linux A72

I have been attempting to setup shared memory between a Linux program and code running on an R5F core. My current attempted setup is to have MCU2_0/R5F0_0 and Linux share a 16mb chunk of memory at 0x90000000. I feel like I am very close to getting things to work, not quite there yet. Probably have a device tree issue. If anyone has guidance that would be awesome.

Current status of things:

  • The shared memory at 0x90000000 seems to be half reserved. If I look at /proc/iomem, the 16mb
    at 0x90000000 is reserved. From the Linux kernel logs, not so much.
  • Below I have a test linux program that reads and writes to the memory at 0x90000000. As long
    as a don’t use weird alignment, the code works fine.
  • My R5F firmware will not start if I have the shared memory array uncommented. From system
    logs I see this error bad phdr da 0x90000000 mem 0x1000000. Seems to either be an issue
    with my linker script or device tree overlay.

The firmware that I have on the R5F is based off GitHub - FredEckert/bbai64_cortex-r5_example at r5_toggle. The linker script has been modified accordingly to link shared memory at address 0x90000000.

Below I have cut down versions of my R5F linker script, device tree overlay, R5F code, and test linux code:

Here is a cut down snippet showing the parts of my linker script for my R5F firmware that should setup shared memory:

/* R5 memory locations */
__DDR0_ALLOCATED_START = 0xA0000000;

__MCU2_0_ALLOCATED_START = __DDR0_ALLOCATED_START + 0x02000000; /* Same for kernel 5 and 6 */
__MCU2_0_EXT_DATA_BASE = __MCU2_0_ALLOCATED_START + 0x00100000;
__MCU2_0_R5F_MEM_TEXT_BASE = __MCU2_0_ALLOCATED_START + 0x00200000;
__MCU2_0_R5F_MEM_DATA_BASE = __MCU2_0_ALLOCATED_START + 0x00300000;
__MCU2_0_DDR_SPACE_BASE = __MCU2_0_ALLOCATED_START + 0x00400000;

#
# bla bla bla
#


MEMORY
{
#
# bla bla bla the obvious 
#

        /* Shared memory region at 0x90000000 */
	DDR0_SHARED_r5f0_0 (rwx) : ORIGIN = 0x90000000, LENGTH = 0x1000000 /* 16MB */
}

ENTRY("vectors")
SECTIONS
{
	/* The TI libraries will not link without *(.ARM.exidx*) discarded */
	/DISCARD/ : { 
		*(.ARM.exidx*) 
	}

#
# bla bla bla stuff was cut out here
#

	/* This memory is for Linux and the r5f0_0 core to share */
	.shared_memory_r5f0_0_and_linux : ALIGN(4096) {
		*(.shared_memory_r5f0_0*)
	} > DDR0_SHARED_r5f0_0
}

For my device tree overlay I have this

/dts-v1/;
/plugin/;

#include <dt-bindings/gpio/gpio.h>
#include <dt-bindings/interrupt-controller/arm-gic.h>
#include <dt-bindings/interrupt-controller/irq.h>
#include <dt-bindings/pinctrl/k3.h>
#include <dt-bindings/board/k3-j721e-bone-pins.h>
#include <dt-bindings/soc/ti,sci_pm_domain.h>
#include <dt-bindings/input/linux-event-codes.h>

&{/chosen} {
	overlays {
		MotorControl.kernel = __TIMESTAMP__;
	};
};

// enable epwm4b for use from r5f (irrelevant to this post, left here just because)
&bone_pwm_4 {
	status = "okay"; // EHRPWM4-B on P9-25
};


&reserved_memory {
	#address-cells = <2>;
	#size-cells = <2>;

	fragment@1 {
		target = <&reserved_memory>;
		__overlay__ {
			shared_memory_r5f_linux: buffer@90000000 {
				compatible = "shared-dma-pool";
				reg = <0x0 0x90000000 0x0 0x01000000>; // 16 MB region, gap in k3-j721e-rotos-memory-map.dtsi
				no-map;
				reusable;
				status = "okay";
			};
		};
	};
};

&mcu_r5fss0_core0 {
	fragment@2 {
		target = <&main_r5fss0_core0>;
		__overlay__ {
			memory-region = <&vision_apps_mcu_r5fss0_core0_dma_memory_region>,
							<&vision_apps_mcu_r5fss0_core0_memory_region>,
							<&shared_memory_r5f_linux>;
		};
	};
};

Here’s my R5F C code (Cut down for simplicity)

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h>

#include <ti/csl/csl_gpio.h>
#include <ti/csl/soc.h>


#define SHARED_MEM_SIZE 0x1000000
// if I uncomment this line the firmware crashes, does not even start
__attribute__((section(".shared_memory_r5f0_0_and_linux"), aligned(4096))) char shared_mem[SHARED_MEM_SIZE];

// if I uncomment this line things work fine, no crash
//__attribute__((section(".log_shared_mem"))) char shared_mem[SOME_SIZE_I_DO_NOT_REMEMBER];



int main()
{   
    printf("Program started from main\n");


#define DELAY 20000000
    for(int i = 0;; i++)
    {
        // Some IO test code was here
        for(volatile int j=0; j< DELAY; j++);

        // Attempt to use shared memory
        strcpy(shared_mem + 0x64, shared_mem);
        printf("%c\n", shared_mem[0]);
    }

    return 0;
}

R5F MPU memory regions:

MpuP_RegionConfig gMpuRegionConfig[] = {
    // position in this array MATTERS, later stuff wins....INCLUDING UNDEFINED STUFF!

    // Complete 32-bit address space
    {
        .baseAddr = 0x0u,
        .size = our_MpuP_RegionSize_4G,
        .attrs = {
            .isEnable = 1,
            .isCacheable = 0,
            .isBufferable = 0, // Maybe could be 1, this is the arm B bit https://developer.arm.com/documentation/ddi0460/d/System-Control/Register-descriptions/c6--MPU-memory-region-programming-registers?lang=en
            .isSharable = 0,
            .isExecuteNever = 0,
            .tex = 1, // 0
            .accessPerm = our_MpuP_AP_ALL_RW,
            .subregionDisableMask = 0x0u,
        },
    },
    // MSRAM region
    {
        .baseAddr = 0x70000000u,
        .size = our_MpuP_RegionSize_8M,
        .attrs = {
            .isEnable = 1,
            .isCacheable = 1,
            .isBufferable = 1,
            .isSharable = 0,
            .isExecuteNever = 0,
            .tex = 7, // 5
            .accessPerm = our_MpuP_AP_ALL_RW,
            .subregionDisableMask = 0x0u,
        },
    },
    // DDR region
    {
        .baseAddr = 0x80000000u,
        .size = our_MpuP_RegionSize_2G,
        .attrs = {
            .isEnable = 1,
            .isCacheable = 1,
            .isBufferable = 1,
            .isSharable = 0, // 1... If this is set to `1` the R5 runs very slow, like 30x slower
            .isExecuteNever = 0,
            .tex = 7, // 5
            .accessPerm = our_MpuP_AP_ALL_RW,
            .subregionDisableMask = 0x0u
        },
    },
    // rpmsg region
    {
        .baseAddr = 0xA2000000u,
        .size = our_MpuP_RegionSize_1M,
        .attrs = {
            .isEnable = 1,
            .isCacheable = 0,
            .isBufferable = 0,
            .isSharable = 1,
            .isExecuteNever = 1,
            .tex = 1,
            .accessPerm = our_MpuP_AP_ALL_RW,
            .subregionDisableMask = 0x0u
        },
    },
    // r5f-linux shared memory region
    {
        .baseAddr = 0x90000000u,
        .size = our_MpuP_RegionSize_16M,
        .attrs = {
            .isEnable = 1,
            .isCacheable = 0,
            .isBufferable = 0,
            .isSharable = 1,
            .isExecuteNever = 1,
            .tex = 1,
            .accessPerm = our_MpuP_AP_ALL_RW,
            .subregionDisableMask = 0x0u
        },
    },
};

Here’s my linux code, works fine. It seems that I have to write on some kinda of alignment…

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>


#define SHARED_MEM_BASE 0x90000000
#define SHARED_MEM_SIZE 0x1000000

int main() {
    int fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (fd < 0) {
        perror("open");
        exit(EXIT_FAILURE);
    }

    // Map shared memory
    void* mapped_base = mmap(NULL, SHARED_MEM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd, SHARED_MEM_BASE);
    if (mapped_base == (void *) -1) {
        perror("mmap");
        close(fd);
        exit(EXIT_FAILURE);
    }


    // write and read from shared memory
    const char data[4096] = "Tux was here";
    printf("Made it to line %d\n",__LINE__);
    fflush(stdout);
    *((unsigned char *) mapped_base) = 0x55 + *((unsigned char *) mapped_base);
    printf("Read from shared memory: %c\n", *((unsigned char *) mapped_base));


    
    // memcpy(mapped_base, data, 31); // bus error
    // memcpy(mapped_base, data, 3); // bus error
    // memcpy(1 + mapped_base, data, 1); // bus error
    memcpy(mapped_base, data, 1); // works
    memcpy(mapped_base, data, 2); // works
    memcpy(mapped_base, data, 4); // works
    memcpy(mapped_base, data, 32); // works
    memcpy(mapped_base, data, 4096); // works
    printf("Read from shared memory: %c\n", *((unsigned char *) mapped_base)); // works
    // printf("Written to shared memory: %s\n", ((unsigned char *) mapped_base)); will crash with bus error


    // for (size_t i = 0; i < 10000; i++)
    // {
    //     printf("Read from %d: %s\n", SHARED_MEM_BASE + 0x100, mapped_base + 0x100);
    // }
    

    // Clean up
    if (munmap(mapped_base, SHARED_MEM_SIZE) == -1) {
        perror("munmap");
    }
    close(fd);

    printf("Yay I made it to the end\n");
    return 0;
}


Error logs and so on:

Here’s journalctl after an attempt to run my r5f code. There’s a bad phdr error.

debian@BeagleBone:~/r5f_firmware$ sudo journalctl -k | tail -5
Dec 13 18:55:16 BeagleBone kernel: remoteproc remoteproc16: powering up 5c00000.r5f
Dec 13 18:55:16 BeagleBone kernel: remoteproc remoteproc16: Booting fw image r5f_r5f0_0_kernel_5.elf, size 22812300
Dec 13 18:55:16 BeagleBone kernel: remoteproc remoteproc16: bad phdr da 0x90000000 mem 0x1000000
Dec 13 18:55:16 BeagleBone kernel: remoteproc remoteproc16: Failed to load program segments: -22
Dec 13 18:55:16 BeagleBone kernel: remoteproc remoteproc16: Boot failed: -22

Here is /proc/iomem showing that 0x90000000 has been successfully reserved by my device tree overlay:

debian@BeagleBone:~/r5f_firmware/tmp_linux_shared_mem_test$ sudo cat /proc/iomem | tail -33
70000000-707fffff : 70000000.sram sram@70000000
80000000-8fffffff : System RAM
  82000000-8364ffff : Kernel code
  83650000-839fffff : reserved
  83a00000-83cfffff : Kernel data
  8f6a9000-8f6dcfff : reserved
  8f6e1000-8fffffff : reserved
90000000-90ffffff : reserved.................(LOOK AT THIS!)
91000000-9e7fffff : System RAM
  91800000-9e7fffff : reserved
9e800000-aeffffff : reserved
af000000-afffffff : System RAM
b0000000-b7ffffff : reserved
b8000000-d7ffffff : System RAM
  b8000000-d7ffffff : reserved
d8000000-fcffffff : reserved
fd000000-ffffffff : System RAM
880000000-88fffffff : reserved
890000000-8ffffffff : System RAM
  8fb000000-8feffffff : reserved
  8ff2fc000-8ff35bfff : reserved
  8ff35c000-8ff95cfff : reserved
  8ff95d000-8ff99cfff : reserved
  8ff99f000-8ff9a0fff : reserved
  8ff9a1000-8ff9a1fff : reserved
  8ff9a2000-8ff9a7fff : reserved
  8ff9a8000-8ffffffff : reserved
4d80800000-4d80847fff : 4d80800000.dsp
4d80e00000-4d80e07fff : 4d80800000.dsp
4d80f00000-4d80f07fff : 4d80800000.dsp
4d81800000-4d81847fff : 4d81800000.dsp
4d81e00000-4d81e07fff : 4d81800000.dsp
4d81f00000-4d81f07fff : 4d81800000.dsp

Here’s some filtered output from dmesg showing that 0x90000000 does not seem to be reserved, or something:

debian@BeagleBone:~/r5f_firmware$ sudo dmesg | egrep "reserved mem|90000000"
[    0.000000] OF: reserved mem: node buffer@90000000 compatible matching fail
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a0000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a0100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a1000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a1100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a2000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a2100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a4000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a4100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a6000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a6100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-dma-memory@a7000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-memory@a7100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c66-dma-memory@a8000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c66-memory@a8100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c66-dma-memory@a9000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c66-memory@a9100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c71-dma-memory@aa000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-c71-memory@aa100000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-dma-memory@b2000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision_apps_shared-memories, compatible id dma-heap-carveout
[    0.000000] OF: reserved mem: initialized node vision-apps-core-heap-memory-lo@d8000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-virtual-eth-queues@fb000000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-r5f-virtual-eth-buffers@fb800000, compatible id shared-dma-pool
[    0.000000] OF: reserved mem: initialized node vision-apps-core-heap-memory-hi@880000000, compatible id shared-dma-pool
[    0.000000]   node   0: [mem 0x0000000090000000-0x0000000090ffffff]
[    0.000000]   node   0: [mem 0x0000000890000000-0x00000008ffffffff]
[    7.548157] k3-dsp-rproc 4d80800000.dsp: assigned reserved memory node vision-apps-c66-dma-memory@a9000000
[    7.627898] k3-dsp-rproc 4d81800000.dsp: assigned reserved memory node vision-apps-c66-dma-memory@a8000000
[    7.735405] k3-dsp-rproc 64800000.dsp: assigned reserved memory node vision-apps-c71-dma-memory@aa000000
[    8.361383] platform 41000000.r5f: assigned reserved memory node vision-apps-r5f-dma-memory@a0000000
[    8.368685]  remoteproc15#vdev0buffer: assigned reserved memory node vision-apps-r5f-dma-memory@a0000000
[    8.381330] platform 5c00000.r5f: assigned reserved memory node vision-apps-r5f-dma-memory@a2000000
[    8.407434] platform 5d00000.r5f: assigned reserved memory node vision-apps-r5f-dma-memory@a4000000
[    8.455005] platform 5e00000.r5f: assigned reserved memory node vision-apps-r5f-dma-memory@a6000000
[    8.497513] platform 5f00000.r5f: assigned reserved memory node vision-apps-r5f-dma-memory@a7000000
[   62.183218] remoteproc remoteproc16: bad phdr da 0x90000000 mem 0x1000000

Random notes;
My Beaglebone is using kernel 5.10, the firmware image is from April 2023.
I am very new to device trees and shared memory, learning this all through struggling with my BeagleBone.

What happens if you just read from 0x90000000 with the R5F using a pointer? I suspect it may just work…

It just worked, well kinda

The shared memory buffer is picky. It seems that all accesses must be 16bit aligned. Many of the C library functions cause bus errors. I had to write my own simple strlen and memcpy; will need to do more. Have not tried unaligned memory from the R5F side, only from Linux so far.

It’s really weird that __attribute__((section(".shared_memory_r5f0_0_and_linux"), aligned(4096))) char shared_mem[SHARED_MEM_SIZE]; breaks things. Maybe I need to make sure it’s clear that this array should not be initialized at ELF load / at all.

Two way shared mem example

I got a two-way shared memory example working. The Linux cores reads an int from one place, adds 1, and then writes it to another place constantly in a loop. The R5F core copies back to the original place.

Linux code:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <stddef.h>
#include <stdint.h>
#include <errno.h>

#define SHARED_MEM_PHYSICAL_BASE 0x90000000
#define SHARED_MEM_SIZE 0x1000000


static size_t full_pages(const size_t size)
{
    size_t page = sysconf(_SC_PAGESIZE);
    if (size < page)
        return page;
    else
    if (size % page)
        return size + page - (size % page);
    else
        return size;
}

void *shared_mem_mapping = NULL;
int shared_mem_fd = -1;

void *setup_shared_buffer(){
    size_t shared_size = full_pages(SHARED_MEM_SIZE);
    if (shared_size != SHARED_MEM_SIZE) {
        printf("Error: Shared memory not paged aligned, requested size: %d, smaller aligned size: %ld\n", SHARED_MEM_SIZE, shared_size);
    }
    int shared_mem_fd = open("/dev/mem", O_RDWR | O_SYNC);
    if (shared_mem_fd < 0) {
        perror("open");
        exit(EXIT_FAILURE);
    }

    // Probably not needed
    ftruncate(shared_mem_fd, SHARED_MEM_SIZE);

    // Map shared memory
    void *shared_mem_mapping = mmap(NULL, SHARED_MEM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, shared_mem_fd, SHARED_MEM_PHYSICAL_BASE);
    if (shared_mem_mapping == MAP_FAILED) {
        perror("mmap");
        close(shared_mem_fd);
        exit(EXIT_FAILURE);
    }

    printf("Setup memory at physical address %x setup successfully\n", SHARED_MEM_PHYSICAL_BASE);
    fflush(stdout);

    return shared_mem_mapping;
}

void deinit_shared_mem() {
    if (munmap(shared_mem_mapping, SHARED_MEM_SIZE) == -1) {
        perror("munmap");
    }
    close(shared_mem_fd);
}

int main() {
    char *mapped_base = setup_shared_buffer();

    for(int i = 0;; i++){

        // grab data from mapped_base + 8000, add 1, then write to mapped_base + 12000. The r5f core will copy back.
        *((int *)mapped_base + 3000) = *((int *)mapped_base + 2000) + 1;
        printf("%p: %d, %p: %d\n", (int *)mapped_base + 3000, *((int *)mapped_base + 3000), (int *)mapped_base + 2000, *((int *)mapped_base + 2000) + 1);

        printf("Sleeping %d\n", i);
        sleep(1);
    }

    return 0;
}

R5F code:

int main()
{   
    #define DELAY 20000000
    for(int i = 0;; i++)
    {
        for(volatile int j=0; j< DELAY; j++);

        char *mapped_base = (char *)0x90000000;
        *((int *)mapped_base + 2000) = *((int *)mapped_base + 3000);
        printf("%p: %d, %p: %d\n", (int *)mapped_base + 3000, *((int *)mapped_base + 3000), (int *)mapped_base + 2000, *((int *)mapped_base + 2000) + 1);
    }
}

I now need to figure out cache coherency stuff. Maybe I could setup and mutex or semaphore for Linux and my R5F firmware to use

3 Likes