openwrt/target/linux/realtek/image/rt-loader/src/startup.S
Markus Stockhausen ccbff8bbdd realtek: add rt-loader (runtime loader)
The bootloader of many Realtek switches only supports gzipped kernel images.
With limited flash space that might become critical in future versions. To get
better compression, add support for LZMA-compressed images. For this a new loader
was developed. Several ideas have been taken over from the existing lzma loader,
but it has been enhanced to make integration simpler. What is new:

- Loader is position independent. No need to define load addresses
- Loader identifies device memory on its own
- Loader uses "official" upstream kernel lzma uncompress
  https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/lib/decompress_unlzma.c
- Loader uses "official" UNMODIFIED nanoprintf that is used by several
  bare metal projects. https://github.com/charlesnicholson/nanoprintf

Compiled, the loader is just under 12 KiB, and during boot it will show:

rt-loader
Found RTL8380M (chip id 6275C) with 256MB
Relocate 2924240 bytes from 0x80100000 to 0x8fce0000
Extract kernel with 2900144 bytes from 0x8fce521c to 0x80100000...
Extracted kernel size is 9814907 bytes
Booting kernel from 0x80100000 ...

[    0.000000] Linux version 6.12.33 ...
[    0.000000] RTL838X model is 83806800
...

Signed-off-by: Markus Stockhausen <markus.stockhausen@gmx.de>
Link: https://github.com/openwrt/openwrt/pull/18397
Signed-off-by: Robert Marko <robimarko@gmail.com>
2025-06-28 16:14:55 +02:00

182 lines
4.4 KiB
ArmAsm

# rt-loader assembler startup code
# (c) 2025 Markus Stockhausen
#include "globals.h"
# This startup code allows position independent code (PIC) to run on bare metal. In that case
# all addresses are looked up via the global offset table (GOT). But the GOT must be filled
# during this initialization sequence first. Until then the standard "la" instruction cannot
# be used. Provide a macro that avoids the dependency.
#
# _LA reg, symbol
#   reg    - register that receives the computed address
#   symbol - symbol whose link time address is taken via %hi/%lo
#
# The macro adds the content of t9 to the link time address, so t9 must already hold the
# current load address when it is used. The non-trapping addiu/addu are used because this is
# pointer arithmetic: addi/add would raise an integer overflow exception on signed overflow.
.macro	_LA reg, symbol
	lui	\reg, %hi(\symbol)
	addiu	\reg, \reg, %lo(\symbol)
	addu	\reg, \reg, $t9
.endm
# _start - entry point of the loader
#
# Runs at the initial load address and, as described further down, once more after the code
# has copied itself to another address. Steps:
#   - find the current load address (kept in t9)
#   - first run only: remember the load address in _kernel_load_addr and zero the BSS
#   - adjust all GOT entries by the distance to the previous load address
#   - fill the global state variables (load address/size, kernel data address/size)
#   - set up the heap bounds and the stack pointer
#   - call main()
# The argument registers a0-a3 are never written here.
	.section .text
	.globl	_start
	.ent	_start
_start:
	.set	noreorder

# Determine current program load address and store it into t9. bal leaves the address of the
# instruction behind its delay slot in ra, which is _start + 8.
	bal	_where_am_i
	nop
_where_am_i:
	move	$t9, $ra
	subu	$t9, $t9, 0x8

# Check if this is our first run (_kernel_load_addr = 0?)
	_LA	$t6, _kernel_load_addr
	lw	$t7, 0($t6)
	bne	$zero, $t7, _init_done
	nop

# During first run store the current load address as the target kernel load address.
	sw	$t9, 0($t6)

# Same for the global variables in the BSS section. Clear them only during the first run. This
# way the "global program state" can be copied over to the relocation address.
# The loop steps one word at a time and stops on equality, so __bss_start and __bss_end are
# expected to be word aligned (presumably guaranteed by the linker script).
	_LA	$t3, __bss_start
	_LA	$t4, __bss_end
_bss_zero:
	beq	$t3, $t4, _init_done
	nop
	sw	$zero, 0($t3)
	addiu	$t3, $t3, 4
	b	_bss_zero
	nop

_init_done:
# Code is running bare metal and no one initializes the global offset table. After the build
# process the table is relative to address 0x0. Starting from anywhere else breaks the program.
# A manual update is required during startup. Usually this is quite easy by simply adding the
# current load address to all entries.
# But this code relocates itself to another memory address and starts itself over. At the new
# address it will find a global offset table that fits to the previous execution. To solve this
# store a copy of the last load address in the _got_delta variable and only add the difference
# after a relocation. Sequence is as follows
#
# - U-Boot loads the code to 0x80100000
# - U-Boot runs the code at 0x80100000
# - code identifies its dynamic start_address = 0x80100000
# - code reads (initial) _got_delta = 0x00000000
# - code adds 0x80100000 to all GOT entries
# - code stores _got_delta with 0x80100000
# - code copies itself over to a new location 0x85000000
# - code starts itself from 0x85000000
# - code identifies its dynamic start_address = 0x85000000
# - code reads (pre-filled) _got_delta = 0x80100000
# - code adds 0x4f00000 (= 0x85000000 - 0x80100000) to all GOT entries
# - ...
#
# t7 = current load address - previous load address, _got_delta = current load address
	_LA	$t6, _got_delta
	lw	$t5, 0($t6)
	subu	$t7, $t9, $t5
	sw	$t9, 0($t6)

# Add t7 to every word between __got_start and __got_end.
	_LA	$t3, __got_start
	_LA	$t4, __got_end
_got_patch:
	beq	$t3, $t4, _got_done
	nop
	lw	$t5, 0($t3)
	addu	$t5, $t5, $t7
	sw	$t5, 0($t3)
	addiu	$t3, $t3, 4
	b	_got_patch
	nop

_got_done:
# Linker attached kernel to end of package. Store addresses in global variables
	_LA	$t8, _my_load_addr
	sw	$t9, 0($t8)

	_LA	$t5, __kernel_data_start
	_LA	$t4, _kernel_data_addr
	sw	$t5, 0($t4)

	_LA	$t3, __kernel_data_end
	subu	$t3, $t3, $t5
	_LA	$t4, _kernel_data_size
	sw	$t3, 0($t4)

# Determine own code size by looking where BSS ends.
	_LA	$t3, __bss_end
	subu	$t6, $t3, $t9
	_LA	$t4, _my_load_size
	sw	$t6, 0($t4)

# Setup heap. It will start directly behind BSS, rounded to the next alignment boundary.
# t3 still holds the runtime address of __bss_end at this point.
	addiu	$t3, $t3, MEMORY_ALIGNMENT
	li	$t4, ~(MEMORY_ALIGNMENT - 1)
	and	$t3, $t3, $t4
	_LA	$t5, _heap_addr
	sw	$t3, 0($t5)

# Heap end. Non-trapping addu because this is address arithmetic.
	li	$t4, HEAP_SIZE
	addu	$t3, $t3, $t4
	_LA	$t5, _heap_addr_max
	sw	$t3, 0($t5)

# Setup stack that is located on top of heap.
	li	$t4, STACK_SIZE
	addu	$sp, $t3, $t4

# Keep the topmost four words of the stack area free. Under the o32 calling convention
# (assumed here, confirm against the compiler flags) a callee may store a0-a3 into
# 0($sp)..12($sp) of its caller. Without this gap those stores would land above the stack.
	addiu	$sp, $sp, -16

# Adapt t9 so it points to main(). This is needed so main() can find the GOT via t9/gp
	_LA	$t8, main
	move	$t9, $t8

# Call main(). The argument registers a0-a3 are never written by this startup code, so main()
# receives them exactly as they were when _start was entered.
	bal	main
	nop

# Presumably main() never returns. Should it return anyway, park the CPU here instead of
# running into whatever follows this function in memory.
1:
	b	1b
	nop
	.end	_start
# Loader state variables. They are placed in .data with an explicit zero start value, so the
# values come from the image itself and are not touched when the BSS gets cleared.
	.section .data
# On MIPS the argument of .align is a power of two, so this is a 16 byte boundary.
	.align	4

# Helper: emit one exported 32 bit variable with start value zero.
.macro	_STATE_WORD name
	.globl	\name
\name:
	.word	0
.endm

# delta for global offset table initialization (file local, holds the last load address)
_got_delta:
	.word	0

# current heap address for malloc() / free()
	_STATE_WORD	_heap_addr

# maximum heap address
	_STATE_WORD	_heap_addr_max

# current program load address
	_STATE_WORD	_my_load_addr

# total size of code including attached kernel and bss (uninitialized global variables)
	_STATE_WORD	_my_load_size

# target load address of kernel = address of this program during the initial run
	_STATE_WORD	_kernel_load_addr

# absolute start address of attached kernel
	_STATE_WORD	_kernel_data_addr

# size of attached kernel
	_STATE_WORD	_kernel_data_size