Provide per-core thread local variables when using picolibc

Change the linker scripts to statically allocate thread local storage
space for each core and to initialize that using picolibc's _init_tls
instead of wedging it between .data and .bss.

Implement pico-specific __aeabi_read_tp and _set_tls functions which
use the core ID register at 0xd0000000 to index an array of TLS base
pointers.

Perform per-core TLS region setup, calling _init_tls and _set_tls
using the areas allocated by the linker script.

Signed-off-by: Keith Packard <keithp@keithp.com>
This commit is contained in:
Keith Packard 2026-02-23 13:52:14 -08:00
parent b0685819e8
commit 49fa7fdb8a
8 changed files with 315 additions and 152 deletions

View file

@ -11,6 +11,7 @@
#include <time.h>
#include <sys/time.h>
#include <sys/times.h>
#include <picotls.h>
#include "pico.h"
#if LIB_PICO_STDIO
@ -135,26 +136,45 @@ void runtime_init(void) {
__libc_init_array();
}
#if !PICO_RUNTIME_NO_INIT_PER_CORE_TLS_SETUP
__weak void runtime_init_pre_core_tls_setup(void) {
// for now we just set the same global area on both cores
// note: that this is superfluous with the stock picolibc it seems, since it is itself
// using a version of __aeabi_read_tp that returns the same pointer on both cores
extern char __tls_base[];
extern void _set_tls(void *tls);
_set_tls(__tls_base);
/* The size of the thread control block.
* TLS relocations are generated relative to
* a location this far *before* the first thread
* variable (!)
* NB: The actual size before tp also includes padding
* to align up to the alignment of .tdata/.tbss.
*/
extern char __arm32_tls_tcb_offset;
#define TP_OFFSET ((size_t)&__arm32_tls_tcb_offset)
static void *__tls[2];
/* Record the thread pointer for the calling core.
 * TLS relocations are generated relative to a point TP_OFFSET bytes
 * *before* the TLS block (see __arm32_tls_tcb_offset above), so bias
 * the caller-supplied base down by that amount before stashing it in
 * the per-core slot that __aeabi_read_tp reads back.
 */
void _set_tls(void *tls) {
    __tls[get_core_num()] = (uint8_t *)tls - TP_OFFSET;
}
#endif
#if !PICO_RUNTIME_SKIP_INIT_PER_CORE_TLS_SETUP
PICO_RUNTIME_INIT_FUNC_PER_CORE(runtime_init_pre_core_tls_setup, PICO_RUNTIME_INIT_PER_CORE_TLS_SETUP);
#endif
/* Per-core TLS areas, statically allocated by the linker script
 * (.tls0/.tls1 sections), one entry per core. Indexed by core number
 * in runtime_init_per_core_tls_setup below.
 * NOTE(review): removed the commented-out legacy __aeabi_read_tp that
 * returned a single shared pointer on both cores — it is superseded by
 * the live per-core implementation further down in this file.
 */
extern char __tls0_base[], __tls1_base[];
static void * const __tls_base[2] = { __tls0_base, __tls1_base };
void runtime_init_per_core_tls_setup(void) {
void *tls_base = __tls_base[get_core_num()];
_init_tls(tls_base);
_set_tls(tls_base);
}
PICO_RUNTIME_INIT_FUNC_PER_CORE(runtime_init_per_core_tls_setup, PICO_RUNTIME_INIT_PER_CORE_TLS_SETUP);
uint32_t __aeabi_read_tp(void);
/* EABI helper: return the thread pointer for the executing core.
 * Reads the core ID register (SIO->CPUID at 0xd0000000) and uses it to
 * index the per-core __tls[] array that _set_tls fills in.
 * Naked because this routine must preserve every register except r0
 * (the result) and lr — hence the manual save/restore of r1 around the
 * CPUID lookup.
 */
uint32_t __attribute__((naked)) __aeabi_read_tp(void) {
pico_default_asm_volatile(
"push {r1,lr} /* Save R1 (and LR) */\n"
"ldr r1,=0xd0000000 /* Address of SIO->CPUID */\n"
"ldr r1,[r1] /* Fetch active core */\n"
"lsls r1,r1,#2 /* Multiply by 4 */\n"
"ldr r0,=%0 /* Address of __tls array */\n"
"ldr r0,[r0,r1] /* Fetch __tls[CPUID] */\n"
"pop {r1,pc} /* Restore R1 and return */\n" : : "i" (__tls)
);
}

View file

@ -133,6 +133,30 @@ SECTIONS
} > FLASH
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -181,37 +205,34 @@ SECTIONS
. = ALIGN(4);
} > RAM AT> FLASH
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM AT> FLASH
PROVIDE(__data_end__ = .);
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss (NOLOAD) : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
. = ALIGN(4);
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;

View file

@ -88,6 +88,30 @@ SECTIONS
} > FLASH
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -182,37 +206,34 @@ SECTIONS
. = ALIGN(4);
} > RAM AT> FLASH
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM AT> FLASH
PROVIDE(__data_end__ = .);
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
. = ALIGN(4);
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;

View file

@ -133,6 +133,30 @@ SECTIONS
} > FLASH
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -181,37 +205,34 @@ SECTIONS
. = ALIGN(4);
} > RAM AT> FLASH
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM AT> FLASH
PROVIDE(__data_end__ = .);
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss (NOLOAD) : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
. = ALIGN(4);
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;
@ -273,9 +294,6 @@ SECTIONS
/* picolibc and LLVM */
PROVIDE (__heap_start = __end__);
PROVIDE (__heap_end = __HeapLimit);
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = (__tls_size + __tls_align - 1) & ~(__tls_align - 1));
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
/* llvm-libc */
PROVIDE (_end = __end__);

View file

@ -93,6 +93,30 @@ SECTIONS
} > RAM
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >RAM
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >RAM
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -145,12 +169,6 @@ SECTIONS
. = ALIGN(4);
} > RAM
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM
PROVIDE(__data_end__ = .);
.uninitialized_data (NOLOAD): {
@ -160,26 +178,29 @@ SECTIONS
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss (NOLOAD) : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
. = ALIGN(4);
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;
@ -236,9 +257,6 @@ SECTIONS
/* picolibc and LLVM */
PROVIDE (__heap_start = __end__);
PROVIDE (__heap_end = __HeapLimit);
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = (__tls_size + __tls_align - 1) & ~(__tls_align - 1));
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
/* llvm-libc */
PROVIDE (_end = __end__);

View file

@ -104,6 +104,30 @@ SECTIONS
} > FLASH
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -200,31 +224,14 @@ SECTIONS
. = ALIGN(4);
} > RAM AT> FLASH
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM AT> FLASH
PROVIDE(__data_end__ = .);
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
PROVIDE(__global_pointer$ = . + 2K);
@ -233,6 +240,20 @@ SECTIONS
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;

View file

@ -145,6 +145,30 @@ SECTIONS
} > FLASH
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -193,30 +217,14 @@ SECTIONS
. = ALIGN(4);
} > RAM AT> FLASH
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM AT> FLASH
PROVIDE(__data_end__ = .);
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss (NOLOAD) : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
@ -226,6 +234,20 @@ SECTIONS
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;

View file

@ -95,6 +95,30 @@ SECTIONS
} > RAM
__exidx_end = .;
/* Assign TLS offsets and load TLS initialization data */
.tdata :
{
*(.tdata .tdata.* .gnu.linkonce.td.*)
PROVIDE( __tdata_end = . );
} >FLASH
.tbss (NOLOAD) : {
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
PROVIDE( __tls_end = . );
PROVIDE( __tbss_end = . );
} >FLASH
PROVIDE( __tls_start = SIZEOF(.tdata) ? ADDR(.tdata) : ADDR(.tbss) );
PROVIDE( __tls_size = __tls_end - __tls_start );
PROVIDE( __tls_align = MAX(ALIGNOF(.tdata), ALIGNOF(.tbss)) );
PROVIDE( __tls_size_align = ALIGN( __tls_size, __tls_align) );
PROVIDE( __arm32_tls_tcb_offset = MAX(8, __tls_align) );
PROVIDE( __tdata_size = SIZEOF(.tdata) );
PROVIDE( __tdata_source = LOADADDR(.tdata) );
PROVIDE( __tbss_size = SIZEOF(.tbss) );
PROVIDE( __tbss_offset = ADDR(.tbss) - __tls_start );
/* Machine inspectable binary information */
. = ALIGN(4);
__binary_info_start = .;
@ -146,37 +170,21 @@ SECTIONS
*(.jcr)
. = ALIGN(4);
} > RAM
.tdata : {
. = ALIGN(4);
*(.tdata .tdata.* .gnu.linkonce.td.*)
/* All data end */
__tdata_end = .;
} > RAM
PROVIDE(__data_end__ = .);
.uninitialized_data (NOLOAD): {
. = ALIGN(4);
*(.uninitialized_data*)
} > RAM
/* __etext is (for backwards compatibility) the name of the .data init source pointer (...) */
__etext = LOADADDR(.data);
.tbss (NOLOAD) : {
. = ALIGN(4);
__bss_start__ = .;
__tls_base = .;
*(.tbss .tbss.* .gnu.linkonce.tb.*)
*(.tcommon)
__tls_end = .;
} > RAM
.bss (NOLOAD) : {
. = ALIGN(4);
__tbss_end = .;
__bss_start__ = .;
*(SORT_BY_ALIGNMENT(SORT_BY_NAME(.bss*)))
*(COMMON)
PROVIDE(__global_pointer$ = . + 2K);
@ -185,6 +193,20 @@ SECTIONS
__bss_end__ = .;
} > RAM
/* Thread local storage static allocations, one per core.
 * NB: GNU ld linker scripts use C-style comments only; '#' is not a
 * comment introducer and would be a syntax error here. */
.tls0 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls0_base = .);
    . = . + __tls_size_align;
} > RAM
.tls1 (NOLOAD) : {
    . = ALIGN(__tls_align);
    PROVIDE(__tls1_base = .);
    . = . + __tls_size_align;
} > RAM
.heap (NOLOAD):
{
__end__ = .;