From a9bc7252091895799908c8e0bb583acc55828c5d Mon Sep 17 00:00:00 2001
From: Steve Markgraf <steve@steve-m.de>
Date: Sat, 7 Dec 2024 23:44:30 +0100
Subject: [PATCH] hdmi: switch to pipelined TMDS encoder

This drastically improves timing and performance, especially on
Tang Nano 9K and 4K.
---
 common/hdmi/hdmi.v         |   1 +
 common/hdmi/tmds_channel.v | 380 +++++++++++++++++++++++++++----------
 2 files changed, 280 insertions(+), 101 deletions(-)

diff --git a/common/hdmi/hdmi.v b/common/hdmi/hdmi.v
index 4879429..22c7bda 100644
--- a/common/hdmi/hdmi.v
+++ b/common/hdmi/hdmi.v
@@ -280,6 +280,7 @@ module hdmi (
 		for (i = 0; i < NUM_CHANNELS; i = i + 1) begin : tmds_gen
 			tmds_channel #(.CN(i)) tmds_channel(
 				.clk_pixel(clk_pixel),
+				.reset(reset),
 				.video_data(video_data[(i * 8) + 7:i * 8]),
 				.data_island_data(data_island_data[(i * 4) + 3:i * 4]),
 				.control_data(control_data[(i * 2) + 1:i * 2]),
diff --git a/common/hdmi/tmds_channel.v b/common/hdmi/tmds_channel.v
index 6fee6cd..db8841a 100644
--- a/common/hdmi/tmds_channel.v
+++ b/common/hdmi/tmds_channel.v
@@ -1,111 +1,71 @@
-// Implementation of HDMI Spec v1.4a Section 5.4: Encoding, Section 5.2.2.1: Video Guard Band, Section 5.2.3.3: Data Island Guard Bands.
-// By Sameer Puri https://github.com/sameer
-// source: https://github.com/hdl-util/hdmi/
+// Pipelined TMDS encoder
+// based on https://github.com/juj/gowin_flipflop_drainer/blob/main/src/hdmi.v (Public Domain)
+//
+// TERC4 + Video/Data Guard bands based on https://github.com/hdl-util/hdmi/
+// by Sameer Puri
+//
+// adapted for hsdaoh by Steve Markgraf <steve@steve-m.de>
+//
 // Dual-licensed under Apache License 2.0 and MIT License.
 
-// converted to Verilog for hsdaoh
-
-module tmds_channel (
-	clk_pixel,
-	video_data,
-	data_island_data,
-	control_data,
-	mode,
-	tmds
+// tmds_encoder performs Transition-minimized differential signaling (TMDS) encoding of
+// 8-bits of pixel data and 2-bits of control data to a 10-bit TMDS encoded format.
+module tmds_channel(
+	input clk_pixel,              // HDMI pixel clock
+	input reset,                  // reset (active high)
+	input [7:0] video_data,       // Input 8-bit color
+	input [3:0] data_island_data, // HDMI data island data
+	input [1:0] control_data,     // control data (vsync and hsync)
+	input [2:0] mode,             // Mode select (0 = control, 1 = video, 2 = video guard, 3 = island, 4 = island guard)
+	output reg [9:0] tmds         // encoded 10-bit TMDS data
 );
+
 	// TMDS Channel number.
 	// There are only 3 possible channel numbers in HDMI 1.4a: 0, 1, 2.
 	parameter [1:0] CN = 0;
 
-	input wire clk_pixel;
-	input wire [7:0] video_data;
-	input wire [3:0] data_island_data;
-	input wire [1:0] control_data;
-	input wire [2:0] mode; // Mode select (0 = control, 1 = video, 2 = video guard, 3 = island, 4 = island guard)
-	output reg [9:0] tmds = 10'b1101010100;
+	// Intermediate pipelined variables: the number after each reg specifies the clock cycle of the pipeline the values are accessed at.
 
-	// See Section 5.4.4.1
-	// Below is a direct implementation of Figure 5-7, using the same variable names.
-	reg signed [4:0] acc = 5'sd0;
+	// Reset
+	reg rst0;
+	// Unencoded input data
+	reg [7:0] dat0, dat1, dat2, dat3, dat4, dat5, dat6, dat7 ;
+	// Control signal (hsync and vsync)
+	reg [1:0] ctl0, ctl1, ctl2, ctl3, ctl4, ctl5, ctl6, ctl7, ctl8, ctl9, ctl10, ctl11, ctl12, ctl13, ctl14, ctl15, ctl16, ctl17, ctl18, ctl19;
+	// Output mode
+	reg [2:0] mode0, mode1, mode2, mode3, mode4, mode5, mode6, mode7, mode8, mode9, mode10, mode11, mode12, mode13, mode14, mode15, mode16, mode17, mode18, mode19;
+	// Data island data
+	reg [3:0] di_dat0, di_dat1, di_dat2, di_dat3, di_dat4, di_dat5, di_dat6, di_dat7, di_dat8, di_dat9, di_dat10, di_dat11, di_dat12, di_dat13, di_dat14, di_dat15, di_dat16, di_dat17, di_dat18, di_dat19;
+	// Display enable signal
+	reg den0, den1, den2, den3, den4, den5, den6, den7, den8, den9, den10, den11, den12, den13, den14, den15, den16, den17, not_den18;
+	// Parity count of input data
+	reg [4:0] par1, par2, par3, par4, par5, par6, par7, par8;
+	// Parity bit of input data (if set, input had >= 4 bits set).
+	reg par9, par10, par11, par12, par13, par14, par15, par16, par17, par18;
+	// Intermediate encoded stage of the input vector.
+	reg [7:0] enc3, enc4, enc5, enc6, enc7, enc8, enc9, enc10, enc11, enc12, enc13, enc14, enc15, enc16, enc17, enc18;
+	// Count the number of ones in the intermediate encoded data
+	reg signed [3:0] eon10, eon11, eon13, eon14, eon15, eon16, eon17, eon18;
+	// Is Encoded ONes even?
+	reg eve18;
+	// Temp values for accumulating the count of ones in the encoded vector.
+	reg [3:0] tpa10, tpa11, tpb11;
+	reg [2:0] tpa12, tpb12;
+	// Pipelined values for updating the bias count.
+	reg signed [3:0] inv18, shr18, shl18;
+	// Pipelined values for the output TMDS data.
+	reg [9:0] tmds_blank18, tmds_even18, tmds_pos18, tmds_neg18;
+	// 'bias' stores the running TMDS ones vs zeros balance count. If > 0, we've sent more ones to the bus,
+	// if < 0, we've sent more zeroes than ones, if == 0, we are at equal balance.
+	reg signed [3:0] bias;
 
-	reg [8:0] q_m;
-	reg [9:0] q_out;
-	wire [9:0] video_coding;
-	assign video_coding = q_out;
-
-	reg [3:0] N1D;
-	reg signed [4:0] N1q_m07;
-	reg signed [4:0] N0q_m07;
-	always @(*) begin
-		N1D = video_data[0] + video_data[1] + video_data[2] + video_data[3] + video_data[4] + video_data[5] + video_data[6] + video_data[7];
-		case (q_m[0] + q_m[1] + q_m[2] + q_m[3] + q_m[4] + q_m[5] + q_m[6] + q_m[7])
-			4'b0000: N1q_m07 = 5'sd0;
-			4'b0001: N1q_m07 = 5'sd1;
-			4'b0010: N1q_m07 = 5'sd2;
-			4'b0011: N1q_m07 = 5'sd3;
-			4'b0100: N1q_m07 = 5'sd4;
-			4'b0101: N1q_m07 = 5'sd5;
-			4'b0110: N1q_m07 = 5'sd6;
-			4'b0111: N1q_m07 = 5'sd7;
-			4'b1000: N1q_m07 = 5'sd8;
-			default: N1q_m07 = 5'sd0;
-		endcase
-		N0q_m07 = 5'sd8 - N1q_m07;
-	end
-	reg signed [4:0] acc_add;
-	integer i;
-	always @(*) begin
-		if ((N1D > 4'd4) || ((N1D == 4'd4) && (video_data[0] == 1'd0))) begin
-			q_m[0] = video_data[0];
-			for (i = 0; i < 7; i = i + 1)
-				q_m[i + 1] = q_m[i] ~^ video_data[i + 1];
-			q_m[8] = 1'b0;
-		end
-		else begin
-			q_m[0] = video_data[0];
-			for (i = 0; i < 7; i = i + 1)
-				q_m[i + 1] = q_m[i] ^ video_data[i + 1];
-			q_m[8] = 1'b1;
-		end
-		if ((acc == 5'sd0) || (N1q_m07 == N0q_m07)) begin
-			if (q_m[8]) begin
-				acc_add = N1q_m07 - N0q_m07;
-				q_out = {~q_m[8], q_m[8], q_m[7:0]};
-			end
-			else begin
-				acc_add = N0q_m07 - N1q_m07;
-				q_out = {~q_m[8], q_m[8], ~q_m[7:0]};
-			end
-		end
-		else if (((acc > 5'sd0) && (N1q_m07 > N0q_m07)) || ((acc < 5'sd0) && (N1q_m07 < N0q_m07))) begin
-			q_out = {1'b1, q_m[8], ~q_m[7:0]};
-			acc_add = (N0q_m07 - N1q_m07) + (q_m[8] ? 5'sd2 : 5'sd0);
-		end
-		else begin
-			q_out = {1'b0, q_m[8], q_m[7:0]};
-			acc_add = (N1q_m07 - N0q_m07) - (~q_m[8] ? 5'sd2 : 5'sd0);
-		end
-	end
-
-	always @(posedge clk_pixel) acc <= (mode != 3'd1 ? 5'sd0 : acc + acc_add);
-
-	// See Section 5.4.2
-	reg [9:0] control_coding;
-	always @(*)
-	begin
-		case (control_data)
-			2'b00: control_coding = 10'b1101010100;
-			2'b01: control_coding = 10'b0010101011;
-			2'b10: control_coding = 10'b0101010100;
-			2'b11: control_coding = 10'b1010101011;
-		endcase
-	end
+	reg [9:0] tmds_video;
 
 	// See Section 5.4.3
 	reg [9:0] terc4_coding;
 	always @(*)
 	begin
-		case (data_island_data)
+		case (di_dat19)
 			4'b0000: terc4_coding = 10'b1010011100;
 			4'b0001: terc4_coding = 10'b1001100011;
 			4'b0010: terc4_coding = 10'b1011100100;
@@ -143,16 +103,234 @@ module tmds_channel (
 			assign data_guard_band = 10'b0100110011;
 		end
 		else begin : genblk2
-			assign data_guard_band = (control_data == 2'b00 ? 10'b1010001110 : (control_data == 2'b01 ? 10'b1001110001 : (control_data == 2'b10 ? 10'b0101100011 : 10'b1011000011)));
+			assign data_guard_band = (ctl19 == 2'b00 ? 10'b1010001110 : (ctl19 == 2'b01 ? 10'b1001110001 : (ctl19 == 2'b10 ? 10'b0101100011 : 10'b1011000011)));
 		end
 	endgenerate
 
-	// Apply selected mode.
-	always @(posedge clk_pixel)
-	begin
-		case (mode)
-			3'd0: tmds <= control_coding;
-			3'd1: tmds <= video_coding;
+	always @(posedge clk_pixel) begin
+		// Clock 0: register inputs
+		rst0 <= reset;
+		dat0 <= video_data;
+		di_dat0 <= data_island_data;
+		ctl0 <= control_data;
+		den0 <= (mode == 1); // display enable (high=pixel data active. low=display is in blanking area)
+		mode0 <= mode;
+
+		// Clock 1: handle reset early by folding it into the other fields
+		dat1 <= dat0;
+		ctl1 <= rst0 ? 2'b0 : ctl0;
+		den1 <= rst0 ? 1'b0 : den0;
+		mode1 <= rst0 ? 3'b0 : mode0;
+
+		// Clock 2: sanitize image data to zero if inside display blank (or reset)
+		dat2 <= den1 ? dat1 : 8'b0;
+		ctl2 <= ctl1;
+		den2 <= den1;
+		mode2 <= mode1;
+
+		// Clocks 3-7: Pipeline 'dat' for the duration of the parity encoding below.
+		dat3 <= dat2;
+		dat4 <= dat3;
+		dat5 <= dat4;
+		dat6 <= dat5;
+		dat7 <= dat6;
+
+		// Clocks 1-8: Calculate parity, i.e. whether the input vector 'dat' has more
+		//             ones in it than zeros. If it has 4 zeros and 4 ones, use ~dat[0]
+		//             as a tie. To do that, start with constant vector 00001, and for
+		//             each bit set in input 'dat', shift 'par' left by one place, filling
+		//             in ones. At the end par[4] will specifies whether there were more
+		//             ones than zeroes.
+		par1 <= 5'b00001;
+		par2 <= dat1[1] ? {par1[3:0], 1'b1} : par1; // = 000ab (a,b=unknown, 000=zeroes)
+		par3 <= dat2[2] ? {par2[3:0], 1'b1} : par2; // = 00abc
+		par4 <= dat3[3] ? {par3[3:0], 1'b1} : par3; // = 0abcd
+		par5 <= dat4[4] ? {par4[3:0], 1'b1} : par4; // = abcdx (x=don't care, rely on optimizer to clear these away)
+		par6 <= dat5[5] ? {par5[3:0], 1'b1} : par5; // = bcdxx
+		par7 <= dat6[6] ? {par6[3:0], 1'b1} : par6; // = cdxxx
+		par8 <= dat7[7] ? {par7[3:0], 1'b1} : par7; // = dxxxx
+
+		// Clocks 9-18: No further calculation needed for parity. Keep pipelining it forward
+		//              in a single bit vector.
+		par9 <= par8[4]; // At the end of computation par[4] records the parity.
+		par10 <= par9;
+		par11 <= par10;
+		par12 <= par11;
+		par13 <= par12;
+		par14 <= par13;
+		par15 <= par14;
+		par16 <= par15;
+		par17 <= par16;
+		par18 <= par17;
+
+		// Clocks 3-18: No more changes needed to the Display Enable signal, flow it through the pipeline
+		den3 <= den2;
+		den4 <= den3;
+		den5 <= den4;
+		den6 <= den5;
+		den7 <= den6;
+		den8 <= den7;
+		den9 <= den8;
+		den10 <= den9;
+		den11 <= den10;
+		den12 <= den11;
+		den13 <= den12;
+		den14 <= den13;
+		den15 <= den14;
+		den16 <= den15;
+		den17 <= den16;
+		not_den18 <= ~den17;
+
+		mode3 <= mode2;
+		mode4 <= mode3;
+		mode5 <= mode4;
+		mode6 <= mode5;
+		mode7 <= mode6;
+		mode8 <= mode7;
+		mode9 <= mode8;
+		mode10 <= mode9;
+		mode11 <= mode10;
+		mode12 <= mode11;
+		mode13 <= mode12;
+		mode14 <= mode13;
+		mode15 <= mode14;
+		mode16 <= mode15;
+		mode17 <= mode16;
+		mode18 <= mode17;
+		mode19 <= mode18;
+
+		di_dat1 <= di_dat0;
+		di_dat2 <= di_dat1;
+		di_dat3 <= di_dat2;
+		di_dat4 <= di_dat3;
+		di_dat5 <= di_dat4;
+		di_dat6 <= di_dat5;
+		di_dat7 <= di_dat6;
+		di_dat8 <= di_dat7;
+		di_dat9 <= di_dat8;
+		di_dat10 <= di_dat9;
+		di_dat11 <= di_dat10;
+		di_dat12 <= di_dat11;
+		di_dat13 <= di_dat12;
+		di_dat14 <= di_dat13;
+		di_dat15 <= di_dat14;
+		di_dat16 <= di_dat15;
+		di_dat17 <= di_dat16;
+		di_dat18 <= di_dat17;
+		di_dat19 <= di_dat18;
+
+		// Clocks 3-18: Pipeline ctrl data (hsync & vsync), no changes needed.
+		ctl3 <= ctl2;
+		ctl4 <= ctl3;
+		ctl5 <= ctl4;
+		ctl6 <= ctl5;
+		ctl7 <= ctl6;
+		ctl8 <= ctl7;
+		ctl9 <= ctl8;
+		ctl10 <= ctl9;
+		ctl11 <= ctl10;
+		ctl12 <= ctl11;
+		ctl13 <= ctl12;
+		ctl14 <= ctl13;
+		ctl15 <= ctl14;
+		ctl16 <= ctl15;
+		ctl17 <= ctl16;
+		ctl18 <= ctl17;
+		ctl19 <= ctl18;
+
+		// Clocks 3-9: perform intermediate encoded vector 'enc' of the input 'data' field. At the
+		//             end of the encoding, the DVI spec says the encoded vector should look like
+		//             follows:
+		// enc <= { parity ^ data[0] ^ data[1] ^ data[2] ^ data[3] ^ data[4] ^ data[5] ^ data[6] ^ data[7],
+		//                   data[0] ^ data[1] ^ data[2] ^ data[3] ^ data[4] ^ data[5] ^ data[6],
+		//          parity ^ data[0] ^ data[1] ^ data[2] ^ data[3] ^ data[4] ^ data[5],
+		//                   data[0] ^ data[1] ^ data[2] ^ data[3] ^ data[4],
+		//          parity ^ data[0] ^ data[1] ^ data[2] ^ data[3],
+		//                   data[0] ^ data[1] ^ data[2],
+		//          parity ^ data[0] ^ data[1],
+		//                   data[0] };
+		//
+		// Calculate it across a few clock cycles to avoid high complexity per clock. (ignore parity first)
+		// Bit lanes after each clock cycle:
+		//                [7]     [6]    [5]   [4]  [3] [2] [1] [0]
+		// Clock 2:        7       6      5     4    3   2   1   0
+		// Clock 3:       76      65     54    43   32  21  10   0
+		// Clock 4:     7654    6543   5432  4321 3210 210  10   0
+		// Clock 5: 76543210 6543210 543210 43210 3210 210  10   0
+
+		enc3 <= {dat2[7:1]^dat2[6:0], dat2[  0]};
+		enc4 <= {enc3[7:2]^enc3[5:0], enc3[1:0]};
+		enc5 <= {enc4[7:4]^enc4[3:0], enc4[3:0]};
+		enc6 <= enc5;
+		enc7 <= enc6;
+		enc8 <= enc7;
+
+		// Clock 9: Meanwhile, parity computation has completed, so apply the final parity XOR to the
+		//          intermediate encoded vector.
+		enc9 <= enc8 ^ {4{par8[4], 1'b0}};
+		enc10 <= enc9;
+		enc11 <= enc10;
+		enc12 <= enc11;
+		enc13 <= enc12;
+		enc14 <= enc13;
+		enc15 <= enc14;
+		enc16 <= enc15;
+		enc17 <= enc16;
+		enc18 <= enc17;
+
+		// Clocks 10-17: calculate 'eon' (Encoded ONes vs zeros): a signed count that specifies whether
+		//               vector 'enc' has more ones or zeroes in it.
+		tpa10 <= enc9[3:0] ^ enc9[7:4]; // Fold the 8 bit enc vector into two 4-bit halves, and half-
+		tpa11 <= tpa10;                 // Then calculate the number of ones in them in parallel
+		tpb11 <= enc10[3:0] & enc10[7:4];//tpb10;
+		tpa12 <= tpa11[3] + tpa11[2] + tpa11[1] + tpa11[0]; // Then calculate the number of ones in them in parallel
+		tpb12 <= tpb11[3] + tpb11[2] + tpb11[1] + tpb11[0]; // for SV $countones(tpb11) can be used
+		eon13 <= tpa12 + {tpb12, 1'b0}; // Then use a 3-bit + 4-bit addition to bring the full count.
+		eon14 <= eon13 - 3'd4;          // And make the result signed.
+		eon15 <= eon14;
+		eon16 <= eon15;
+		eon17 <= eon16;
+		eon18 <= eon17;
+
+		// 'eon17' is a count of balance of ones vs zeros in input encoded vector 'enc':
+		//        #ones: 8 7 6 5 4  3  2  1  0
+		// #ones-#zeros: 8 6 4 2 0 -2 -4 -6 -8
+		// value of eon: 4 3 2 1 0 -1 -2 -3 -4
+
+		// Pipeline a few finishing touches:
+		eve18 <= eon17 == 0;                      // is the balance equal (zero)?
+		inv18 <= par17 ? -eon17     : eon17;      // invert balance count based on parity.
+		shr18 <= par17 ? eon17      : eon17-1'b1; // right shift balance count based on parity.
+		shl18 <= par17 ? eon17-1'b1 : eon17;      // left shift balance count based on parity.
+		tmds_blank18 <= {~ctl17[1], 9'b101010100} ^ {10{ctl17[0]}};
+		tmds_even18 <= {par17, ~par17, {8{par17}} ^ enc17};
+		tmds_pos18 <= {1'b1, ~par17, 8'hff ^ enc17};
+		tmds_neg18 <= {1'b0, ~par17,         enc17};
+
+		// Clocks 14-17 above:
+		// These are "empty" filler clock stages that contain no computations on any of the variables,
+		// but they only perform direct passthrough of the values that have been computed so far.
+		// Gowin IDE Analyzer reports that this improves max. timing performance.
+
+		// Clock 18: finally output the TMDS encoded value, and update bias value
+		if (not_den18) begin // In display blank?
+			tmds_video <= tmds_blank18;          // Output control words for hsync and vsync
+			bias <= 0;                           // Bias resets to zero in blank
+		end else if (eve18 || bias == 0) begin       // If current bias is even, or encoded balance is even..
+			tmds_video <= tmds_even18;           // .. use a specific 'even' state TMDS formula.
+			bias <= bias + inv18;                // This does not seem to be strictly necessary, you can try removing this else block for tiny bit more performance.
+		end else if (bias[3] == eon18[3]) begin      // Otherwise, noneven bias and balance, so use the main TMDS encoding formula
+			tmds_video <= tmds_pos18;
+			bias <= bias - shr18;                // and update running bias of ones vs zeros sent.
+		end else begin
+			tmds_video <= tmds_neg18;
+			bias <= bias + shl18;
+		end
+
+		// Clock 19: Apply selected mode.
+		case (mode19)
+			3'd0: tmds <= tmds_video;
+			3'd1: tmds <= tmds_video;
 			3'd2: tmds <= video_guard_band;
 			3'd3: tmds <= terc4_coding;
 			3'd4: tmds <= data_guard_band;