improve performance by using three DMA channels

Previously when using an HSTX clock > sysclk/2,
there was a DMA underrun from time to time,
which limited the achievable data rate to
around 75 MByte/s. By using a third DMA channel
and employing some trickery to still be able to
use the DMA CRC sniffer, we can now achieve
128 MByte/s (or even more) by using sysclk/1
as HSTX clock.
The counter example has been updated to
generate those ~128 MByte/s.
This commit is contained in:
Steve Markgraf 2024-12-26 23:18:41 +01:00
parent ae01224d0d
commit bae93b3a87
4 changed files with 61 additions and 52 deletions

View file

@ -1,7 +1,7 @@
# hsdaoh-rp2350 - High Speed Data Acquisition over HDMI
## Stream up to 75 MByte/s from your Raspberry Pi Pico2 to your PC
## Stream up to 128 MByte/s from your Raspberry Pi Pico2 to your PC
Using $5 USB3 HDMI capture sticks based on the MacroSilicon MS2130, this project allows to stream out up to 75 MByte/s of real time data from an RP2350 (with overclocking) to a host computer with USB3.
Using $5 USB3 HDMI capture sticks based on the MacroSilicon MS2130, this project allows streaming up to 128 MByte/s of real-time data from an RP2350 (with overclocking) to a host computer with USB3.
For more information and the host library, see the [main repository](https://github.com/steve-m/hsdaoh) and the [talk at OsmoDevcon '24](https://media.ccc.de/v/osmodevcon2024-200-low-cost-high-speed-data-acquisition-over-hdmi).
![Raspberry Pi Pico2 with MS2130 stick](https://steve-m.de/projects/hsdaoh/pico2_hsdaoh.jpg)

View file

@ -43,7 +43,7 @@
#include "picohsdaoh.h"
#include "counter.pio.h"
#define SYS_CLK 250000
#define SYS_CLK 336000
#define DMACH_PIO_PING 0
#define DMACH_PIO_PONG 1
@ -116,12 +116,14 @@ void init_pio_input(void)
int main()
{
vreg_set_voltage(VREG_VOLTAGE_MAX);
sleep_ms(1);
set_sys_clock_khz(SYS_CLK, true);
/* set HSTX clock to sysclk/2 */
/* set HSTX clock to sysclk/1 */
hw_write_masked(
&clocks_hw->clk[clk_hstx].div,
2 << CLOCKS_CLK_HSTX_DIV_INT_LSB,
1 << CLOCKS_CLK_HSTX_DIV_INT_LSB,
CLOCKS_CLK_HSTX_DIV_INT_BITS
);

View file

@ -33,7 +33,7 @@ static inline void counter_program_init(PIO pio, uint sm, uint offset)
// disable the TX FIFO to make the RX FIFO deeper.
sm_config_set_fifo_join(&c, PIO_FIFO_JOIN_RX);
sm_config_set_clkdiv(&c, 4.f);
sm_config_set_clkdiv(&c, 1.75f);
// Load our configuration, and start the program from the beginning
pio_sm_init(pio, sm, offset, &c);

View file

@ -152,13 +152,16 @@ void hsdaoh_update_head(int head)
fifo_head = head;
}
#define DMACH_HSTX_PING 14
#define DMACH_HSTX_PONG 15
#define CRC16_INIT 0xffff
#define DMACH_HSTX_START 13
#define DMACH_HSTX_COUNT 3
#define CRC16_INIT 0xffff
static bool hstx_dma_pong = false;
static uint v_scanline = 2;
static uint8_t hstx_dma_curchan = 0;
static uint16_t saved_crc;
static uint v_scanline = 3;
static bool vactive_cmdlist_posted = false;
static uint8_t dma_sniff_pipelined_ch = 0;
static bool dma_sniff_pipelined_disable = false;
enum crc_config {
CRC_NONE, /* No CRC, just 16 bit idle counter */
@ -174,16 +177,32 @@ typedef struct
uint8_t crc_config;
} __attribute__((packed, aligned(1))) metadata_t;
metadata_t metadata = (metadata_t) { .magic = 0xda7acab1, .crc_config = CRC16_1_LINE };
metadata_t metadata = (metadata_t) { .magic = 0xda7acab1, .crc_config = CRC16_2_LINE };
/* HSTX DMA IRQ handler, reconfigures the channel that just completed while
* the other channel is currently busy */
void __scratch_x("") hstx_dma_irq_handler()
{
uint ch_num = hstx_dma_pong ? DMACH_HSTX_PONG : DMACH_HSTX_PING;
/* This is a bit tricky and time critical, we pipeline three DMA transfers to avoid an
* underrun, but the DMA sniffer that is used to calculate the CRC cannot be pipelined
* and needs to be reconfigured right before the DMA transfer starts - so we have to
* do that as fast as possible during blanking, before the next DMA transfer with
* active video data, which is right about to start. */
if (dma_sniff_pipelined_ch) {
/* (re)initialize DMA CRC sniffer */
saved_crc = dma_sniffer_get_data_accumulator() & 0xffff;
dma_sniffer_set_data_accumulator(CRC16_INIT);
dma_sniffer_enable(dma_sniff_pipelined_ch, DMA_SNIFF_CTRL_CALC_VALUE_CRC16, true);
dma_sniff_pipelined_ch = 0;
} else if (dma_sniff_pipelined_disable) {
dma_sniffer_disable();
dma_sniff_pipelined_disable = false;
}
uint ch_num = hstx_dma_curchan + DMACH_HSTX_START;
hstx_dma_curchan = (hstx_dma_curchan + 1) % DMACH_HSTX_COUNT;
dma_channel_hw_t *ch = &dma_hw->ch[ch_num];
dma_hw->intr = 1u << ch_num;
hstx_dma_pong = !hstx_dma_pong;
/* for raw commands we need to use 32 bit DMA transfers */
ch->al1_ctrl = (ch->al1_ctrl & ~DMA_CH0_CTRL_TRIG_DATA_SIZE_BITS) | (DMA_SIZE_32 << DMA_CH0_CTRL_TRIG_DATA_SIZE_LSB);
@ -191,7 +210,7 @@ void __scratch_x("") hstx_dma_irq_handler()
if (v_scanline >= MODE_V_FRONT_PORCH && v_scanline < (MODE_V_FRONT_PORCH + MODE_V_SYNC_WIDTH)) {
/* on first line of actual VSYNC, output data packet */
if (v_scanline == MODE_V_FRONT_PORCH) {
dma_sniffer_disable();
dma_sniff_pipelined_disable = true;
ch->read_addr = (uintptr_t)info_p;
ch->transfer_count = info_len;
@ -235,12 +254,10 @@ void __scratch_x("") hstx_dma_irq_handler()
next_line[RBUF_SLICE_LEN - 1] |= ((met_p[cur_active_line/2] & 0x0f) << 12);
}
/* on the second last word of the line, insert the CRC16 of the entire previous line */
next_line[RBUF_SLICE_LEN - 2] = dma_sniffer_get_data_accumulator() & 0xffff;
/* on the second last word of the line, insert the CRC16 of the entire line before the last line */
next_line[RBUF_SLICE_LEN - 2] = saved_crc;
/* (re)initialize DMA CRC sniffer */
dma_sniffer_set_data_accumulator(CRC16_INIT);
dma_sniffer_enable(ch_num, DMA_SNIFF_CTRL_CALC_VALUE_CRC16, true);
dma_sniff_pipelined_ch = ch_num;
/* switch to 16 bit DMA transfer size for the actual data,
* because for YCbCr422 TMDS channel 0 is unused */
@ -266,7 +283,7 @@ void core1_entry()
/* Start operation: hand core1_entry to multicore_launch_core1(), then
 * trigger the HSTX DMA channel(s) named below with dma_channel_start().
 * Takes no arguments and returns nothing. */
void hsdaoh_start(void)
{
multicore_launch_core1(core1_entry);
dma_channel_start(DMACH_HSTX_PING);
dma_channel_start(DMACH_HSTX_START);
}
void hsdaoh_init(uint16_t *ringbuf)//struct hsdaoh_inst *inst, uint16_t *ringbuf)
@ -338,38 +355,28 @@ void hsdaoh_init(uint16_t *ringbuf)//struct hsdaoh_inst *inst, uint16_t *ringbuf
for (int i = 12; i <= 19; ++i)
gpio_set_function(i, 0); // HSTX
/* Both channels are set up identically, to transfer a whole scanline and
* then chain to the opposite channel. Each time a channel finishes, we
* reconfigure the one that just finished, meanwhile the opposite channel
/* All channels are set up identically, to transfer a whole scanline and
* then chain to the next channel. Each time a channel finishes, we
* reconfigure the one that just finished, meanwhile another channel
* is already making progress. */
dma_channel_config c;
c = dma_channel_get_default_config(DMACH_HSTX_PING);
channel_config_set_chain_to(&c, DMACH_HSTX_PONG);
channel_config_set_dreq(&c, DREQ_HSTX);
channel_config_set_sniff_enable(&c, true);
dma_channel_configure(
DMACH_HSTX_PING,
&c,
&hstx_fifo_hw->fifo,
vblank_line_vsync_off,
count_of(vblank_line_vsync_off),
false
);
c = dma_channel_get_default_config(DMACH_HSTX_PONG);
channel_config_set_chain_to(&c, DMACH_HSTX_PING);
channel_config_set_dreq(&c, DREQ_HSTX);
channel_config_set_sniff_enable(&c, true);
dma_channel_configure(
DMACH_HSTX_PONG,
&c,
&hstx_fifo_hw->fifo,
vblank_line_vsync_off,
count_of(vblank_line_vsync_off),
false
);
dma_hw->ints3 = (1u << DMACH_HSTX_PING) | (1u << DMACH_HSTX_PONG);
dma_hw->inte3 = (1u << DMACH_HSTX_PING) | (1u << DMACH_HSTX_PONG);
for (int i = 0; i < DMACH_HSTX_COUNT; i++) {
dma_channel_config c;
c = dma_channel_get_default_config(DMACH_HSTX_START + i);
int chain_to_ch = DMACH_HSTX_START + ((i + 1) % DMACH_HSTX_COUNT);
channel_config_set_chain_to(&c, chain_to_ch);
channel_config_set_dreq(&c, DREQ_HSTX);
channel_config_set_sniff_enable(&c, true);
dma_channel_configure(
DMACH_HSTX_START + i,
&c,
&hstx_fifo_hw->fifo,
vblank_line_vsync_off,
count_of(vblank_line_vsync_off),
false
);
dma_hw->ints3 |= 1u << (DMACH_HSTX_START + i);
dma_hw->inte3 |= 1u << (DMACH_HSTX_START + i);
}
/* give the DMA the priority over the CPU on the bus */
bus_ctrl_hw->priority = BUSCTRL_BUS_PRIORITY_DMA_W_BITS | BUSCTRL_BUS_PRIORITY_DMA_R_BITS;