improve performance by using three DMA channels

Previously, when using an HSTX clock > sysclk/2,
there was a DMA underrun from time to time,
which limited the achievable data rate to
around 75 MByte/s. By using a third DMA channel
and employing some trickery to still be able to
use the DMA CRC sniffer, we can now achieve
128 MByte/s (or even more) by using sysclk/1
as the HSTX clock.
The counter example has been updated to
generate those ~128 MByte/s.
This commit is contained in:
Steve Markgraf 2024-12-26 23:18:41 +01:00
parent ae01224d0d
commit bae93b3a87
4 changed files with 61 additions and 52 deletions

View file

@ -1,7 +1,7 @@
# hsdaoh-rp2350 - High Speed Data Acquisition over HDMI # hsdaoh-rp2350 - High Speed Data Acquisition over HDMI
## Stream up to 75 MByte/s from your Raspberry Pi Pico2 to your PC ## Stream up to 128 MByte/s from your Raspberry Pi Pico2 to your PC
Using $5 USB3 HDMI capture sticks based on the MacroSilicon MS2130, this project allows streaming up to 75 MByte/s of real-time data from an RP2350 (with overclocking) to a host computer with USB3. Using $5 USB3 HDMI capture sticks based on the MacroSilicon MS2130, this project allows streaming up to 128 MByte/s of real-time data from an RP2350 (with overclocking) to a host computer with USB3.
For more information and the host library, see the [main repository](https://github.com/steve-m/hsdaoh) and the [talk at OsmoDevcon '24](https://media.ccc.de/v/osmodevcon2024-200-low-cost-high-speed-data-acquisition-over-hdmi). For more information and the host library, see the [main repository](https://github.com/steve-m/hsdaoh) and the [talk at OsmoDevcon '24](https://media.ccc.de/v/osmodevcon2024-200-low-cost-high-speed-data-acquisition-over-hdmi).
![Raspberry Pi Pico2 with MS2130 stick](https://steve-m.de/projects/hsdaoh/pico2_hsdaoh.jpg) ![Raspberry Pi Pico2 with MS2130 stick](https://steve-m.de/projects/hsdaoh/pico2_hsdaoh.jpg)

View file

@ -43,7 +43,7 @@
#include "picohsdaoh.h" #include "picohsdaoh.h"
#include "counter.pio.h" #include "counter.pio.h"
#define SYS_CLK 250000 #define SYS_CLK 336000
#define DMACH_PIO_PING 0 #define DMACH_PIO_PING 0
#define DMACH_PIO_PONG 1 #define DMACH_PIO_PONG 1
@ -116,12 +116,14 @@ void init_pio_input(void)
int main() int main()
{ {
vreg_set_voltage(VREG_VOLTAGE_MAX);
sleep_ms(1);
set_sys_clock_khz(SYS_CLK, true); set_sys_clock_khz(SYS_CLK, true);
/* set HSTX clock to sysclk/2 */ /* set HSTX clock to sysclk/1 */
hw_write_masked( hw_write_masked(
&clocks_hw->clk[clk_hstx].div, &clocks_hw->clk[clk_hstx].div,
2 << CLOCKS_CLK_HSTX_DIV_INT_LSB, 1 << CLOCKS_CLK_HSTX_DIV_INT_LSB,
CLOCKS_CLK_HSTX_DIV_INT_BITS CLOCKS_CLK_HSTX_DIV_INT_BITS
); );

View file

@ -33,7 +33,7 @@ static inline void counter_program_init(PIO pio, uint sm, uint offset)
// disable the TX FIFO to make the RX FIFO deeper. // disable the TX FIFO to make the RX FIFO deeper.
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_RX); sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_RX);
sm_config_set_clkdiv(&c, 4.f); sm_config_set_clkdiv(&c, 1.75f);
// Load our configuration, and start the program from the beginning // Load our configuration, and start the program from the beginning
pio_sm_init(pio, sm, offset, &c); pio_sm_init(pio, sm, offset, &c);

View file

@ -152,13 +152,16 @@ void hsdaoh_update_head(int head)
fifo_head = head; fifo_head = head;
} }
#define DMACH_HSTX_PING 14 #define DMACH_HSTX_START 13
#define DMACH_HSTX_PONG 15 #define DMACH_HSTX_COUNT 3
#define CRC16_INIT 0xffff #define CRC16_INIT 0xffff
static bool hstx_dma_pong = false; static uint8_t hstx_dma_curchan = 0;
static uint v_scanline = 2; static uint16_t saved_crc;
static uint v_scanline = 3;
static bool vactive_cmdlist_posted = false; static bool vactive_cmdlist_posted = false;
static uint8_t dma_sniff_pipelined_ch = 0;
static bool dma_sniff_pipelined_disable = false;
enum crc_config { enum crc_config {
CRC_NONE, /* No CRC, just 16 bit idle counter */ CRC_NONE, /* No CRC, just 16 bit idle counter */
@ -174,16 +177,32 @@ typedef struct
uint8_t crc_config; uint8_t crc_config;
} __attribute__((packed, aligned(1))) metadata_t; } __attribute__((packed, aligned(1))) metadata_t;
metadata_t metadata = (metadata_t) { .magic = 0xda7acab1, .crc_config = CRC16_1_LINE }; metadata_t metadata = (metadata_t) { .magic = 0xda7acab1, .crc_config = CRC16_2_LINE };
/* HSTX DMA IRQ handler, reconfigures the channel that just completed while /* HSTX DMA IRQ handler, reconfigures the channel that just completed while
* the other channel is currently busy */ * the other channel is currently busy */
void __scratch_x("") hstx_dma_irq_handler() void __scratch_x("") hstx_dma_irq_handler()
{ {
uint ch_num = hstx_dma_pong ? DMACH_HSTX_PONG : DMACH_HSTX_PING; /* This is a bit tricky and time critical, we pipeline three DMA transfers to avoid an
* underrun, but the DMA sniffer that is used to calculate the CRC cannot be pipelined
* and needs to be reconfigured right before the DMA transfer starts - so we have to
* do that as fast as possible during blanking, before the next DMA transfer with
* active video data, which is right about to start. */
if (dma_sniff_pipelined_ch) {
/* (re)initialize DMA CRC sniffer */
saved_crc = dma_sniffer_get_data_accumulator() & 0xffff;
dma_sniffer_set_data_accumulator(CRC16_INIT);
dma_sniffer_enable(dma_sniff_pipelined_ch, DMA_SNIFF_CTRL_CALC_VALUE_CRC16, true);
dma_sniff_pipelined_ch = 0;
} else if (dma_sniff_pipelined_disable) {
dma_sniffer_disable();
dma_sniff_pipelined_disable = false;
}
uint ch_num = hstx_dma_curchan + DMACH_HSTX_START;
hstx_dma_curchan = (hstx_dma_curchan + 1) % DMACH_HSTX_COUNT;
dma_channel_hw_t *ch = &dma_hw->ch[ch_num]; dma_channel_hw_t *ch = &dma_hw->ch[ch_num];
dma_hw->intr = 1u << ch_num; dma_hw->intr = 1u << ch_num;
hstx_dma_pong = !hstx_dma_pong;
/* for raw commands we need to use 32 bit DMA transfers */ /* for raw commands we need to use 32 bit DMA transfers */
ch->al1_ctrl = (ch->al1_ctrl & ~DMA_CH0_CTRL_TRIG_DATA_SIZE_BITS) | (DMA_SIZE_32 << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB); ch->al1_ctrl = (ch->al1_ctrl & ~DMA_CH0_CTRL_TRIG_DATA_SIZE_BITS) | (DMA_SIZE_32 << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB);
@ -191,7 +210,7 @@ void __scratch_x("") hstx_dma_irq_handler()
if (v_scanline >= MODE_V_FRONT_PORCH && v_scanline < (MODE_V_FRONT_PORCH + MODE_V_SYNC_WIDTH)) { if (v_scanline >= MODE_V_FRONT_PORCH && v_scanline < (MODE_V_FRONT_PORCH + MODE_V_SYNC_WIDTH)) {
/* on first line of actual VSYNC, output data packet */ /* on first line of actual VSYNC, output data packet */
if (v_scanline == MODE_V_FRONT_PORCH) { if (v_scanline == MODE_V_FRONT_PORCH) {
dma_sniffer_disable(); dma_sniff_pipelined_disable = true;
ch->read_addr = (uintptr_t)info_p; ch->read_addr = (uintptr_t)info_p;
ch->transfer_count = info_len; ch->transfer_count = info_len;
@ -235,12 +254,10 @@ void __scratch_x("") hstx_dma_irq_handler()
next_line[RBUF_SLICE_LEN - 1] |= ((met_p[cur_active_line/2] & 0x0f) << 12); next_line[RBUF_SLICE_LEN - 1] |= ((met_p[cur_active_line/2] & 0x0f) << 12);
} }
/* on the second last word of the line, insert the CRC16 of the entire previous line */ /* on the second last word of the line, insert the CRC16 of the entire line before the last line */
next_line[RBUF_SLICE_LEN - 2] = dma_sniffer_get_data_accumulator() & 0xffff; next_line[RBUF_SLICE_LEN - 2] = saved_crc;
/* (re)initialize DMA CRC sniffer */ dma_sniff_pipelined_ch = ch_num;
dma_sniffer_set_data_accumulator(CRC16_INIT);
dma_sniffer_enable(ch_num, DMA_SNIFF_CTRL_CALC_VALUE_CRC16, true);
/* switch to 16 bit DMA transfer size for the actual data, /* switch to 16 bit DMA transfer size for the actual data,
* because for YCbCr422 TMDS channel 0 is unused */ * because for YCbCr422 TMDS channel 0 is unused */
@ -266,7 +283,7 @@ void core1_entry()
void hsdaoh_start(void) void hsdaoh_start(void)
{ {
multicore_launch_core1(core1_entry); multicore_launch_core1(core1_entry);
dma_channel_start(DMACH_HSTX_PING); dma_channel_start(DMACH_HSTX_START);
} }
void hsdaoh_init(uint16_t *ringbuf)//struct hsdaoh_inst *inst, uint16_t *ringbuf) void hsdaoh_init(uint16_t *ringbuf)//struct hsdaoh_inst *inst, uint16_t *ringbuf)
@ -338,38 +355,28 @@ void hsdaoh_init(uint16_t *ringbuf)//struct hsdaoh_inst *inst, uint16_t *ringbuf
for (int i = 12; i <= 19; ++i) for (int i = 12; i <= 19; ++i)
gpio_set_function(i, 0); // HSTX gpio_set_function(i, 0); // HSTX
/* Both channels are set up identically, to transfer a whole scanline and /* All channels are set up identically, to transfer a whole scanline and
* then chain to the opposite channel. Each time a channel finishes, we * then chain to the next channel. Each time a channel finishes, we
* reconfigure the one that just finished, meanwhile the opposite channel * reconfigure the one that just finished, meanwhile another channel
* is already making progress. */ * is already making progress. */
dma_channel_config c; for (int i = 0; i < DMACH_HSTX_COUNT; i++) {
c = dma_channel_get_default_config(DMACH_HSTX_PING); dma_channel_config c;
channel_config_set_chain_to(&c, DMACH_HSTX_PONG); c = dma_channel_get_default_config(DMACH_HSTX_START + i);
channel_config_set_dreq(&c, DREQ_HSTX); int chain_to_ch = DMACH_HSTX_START + ((i + 1) % DMACH_HSTX_COUNT);
channel_config_set_sniff_enable(&c, true); channel_config_set_chain_to(&c, chain_to_ch);
dma_channel_configure( channel_config_set_dreq(&c, DREQ_HSTX);
DMACH_HSTX_PING, channel_config_set_sniff_enable(&c, true);
&c, dma_channel_configure(
&hstx_fifo_hw->fifo, DMACH_HSTX_START + i,
vblank_line_vsync_off, &c,
count_of(vblank_line_vsync_off), &hstx_fifo_hw->fifo,
false vblank_line_vsync_off,
); count_of(vblank_line_vsync_off),
c = dma_channel_get_default_config(DMACH_HSTX_PONG); false
channel_config_set_chain_to(&c, DMACH_HSTX_PING); );
channel_config_set_dreq(&c, DREQ_HSTX); dma_hw->ints3 |= 1u << (DMACH_HSTX_START + i);
channel_config_set_sniff_enable(&c, true); dma_hw->inte3 |= 1u << (DMACH_HSTX_START + i);
dma_channel_configure( }
DMACH_HSTX_PONG,
&c,
&hstx_fifo_hw->fifo,
vblank_line_vsync_off,
count_of(vblank_line_vsync_off),
false
);
dma_hw->ints3 = (1u << DMACH_HSTX_PING) | (1u << DMACH_HSTX_PONG);
dma_hw->inte3 = (1u << DMACH_HSTX_PING) | (1u << DMACH_HSTX_PONG);
/* give the DMA the priority over the CPU on the bus */ /* give the DMA the priority over the CPU on the bus */
bus_ctrl_hw->priority = BUSCTRL_BUS_PRIORITY_DMA_W_BITS | BUSCTRL_BUS_PRIORITY_DMA_R_BITS; bus_ctrl_hw->priority = BUSCTRL_BUS_PRIORITY_DMA_W_BITS | BUSCTRL_BUS_PRIORITY_DMA_R_BITS;