update scaler and RAM

2024-05-11 21:53:35 +08:00
parent f3f01c1c51
commit c96e9b57b2
20 changed files with 1393 additions and 72 deletions
--- a/Scaler/scaler.v
+++ b/Scaler/scaler.v
@@ -0,0 +1,760 @@
+/*-----------------------------------------------------------------------------
+
+								Video Stream Scaler
+								
+							Author: David Kronstein
+							
+
+
+Copyright 2011, David Kronstein, and individual contributors as indicated
+by the @authors tag.
+
+This is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as
+published by the Free Software Foundation; either version 2.1 of
+the License, or (at your option) any later version.
+
+This software is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this software; if not, write to the Free
+Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+02110-1301 USA, or see the FSF site: http://www.fsf.org.
+
+								
+-------------------------------------------------------------------------------
+
+Scales streaming video up or down in resolution. Bilinear and nearest neighbor
+modes are supported.
+
+Run-time adjustment of input and output resolution, scaling factors, and scale
+type. 
+
+-------------------------------------------------------------------------------
+
+Revisions
+
+V1.0.0	Feb 21 2011		Initial Release		David Kronstein
+Known bugs:
+Very slight numerical errors (+0/-2 LSb) in output data due to coefficient arithmetic.
+Impossible to notice without adjustment in video levels. Attempted to fix by setting
+coeff11 to 1.0 - other coefficients, but this caused timing issues.
+
+*/
+`default_nettype none
+
+module streamScaler #(
+//---------------------------Parameters----------------------------------------
+parameter	DATA_WIDTH =			8,		//Width of input/output data
+parameter	CHANNELS =				1,		//Number of channels of DATA_WIDTH, for color images
+parameter 	DISCARD_CNT_WIDTH =		8,		//Width of inputDiscardCnt
+parameter	INPUT_X_RES_WIDTH =		11,		//Widths of input/output resolution control signals
+parameter	INPUT_Y_RES_WIDTH =		11,
+parameter	OUTPUT_X_RES_WIDTH =	11,
+parameter	OUTPUT_Y_RES_WIDTH =	11,
+parameter	FRACTION_BITS =			8,		//Number of bits for fractional component of coefficients.
+
+parameter	SCALE_INT_BITS =		4,		//Width of integer component of scaling factor. The maximum input data width to
+											//multipliers created will be SCALE_INT_BITS + SCALE_FRAC_BITS. Typically these
+											//values will sum to 18 to match multipliers available in FPGAs.
+parameter	SCALE_FRAC_BITS =		14,		//Width of fractional component of scaling factor
+parameter	BUFFER_SIZE =			4,		//Depth of RFIFO
+//---------------------Non-user-definable parameters----------------------------
+parameter	COEFF_WIDTH =			FRACTION_BITS + 1,
+parameter	SCALE_BITS =			SCALE_INT_BITS + SCALE_FRAC_BITS,
+parameter	BUFFER_SIZE_WIDTH =		((BUFFER_SIZE+1) <= 2) ? 1 :	//wide enough to hold value BUFFER_SIZE + 1
+									((BUFFER_SIZE+1) <= 4) ? 2 :
+									((BUFFER_SIZE+1) <= 8) ? 3 :
+									((BUFFER_SIZE+1) <= 16) ? 4 :
+									((BUFFER_SIZE+1) <= 32) ? 5 :
+									((BUFFER_SIZE+1) <= 64) ? 6 : 7
+)(
+//---------------------------Module IO-----------------------------------------
+//Clock and reset
+input wire							clk,
+input wire							rst,
+
+//User interface
+//Input
+input wire [DATA_WIDTH*CHANNELS-1:0]dIn,
+input wire							dInValid,
+output wire							nextDin,
+input wire							start,
+
+//Output
+output reg [DATA_WIDTH*CHANNELS-1:0]
+									dOut,
+output reg							dOutValid,			//latency of 4 clock cycles after nextDout is asserted
+input wire							nextDout,
+
+//Control
+input wire [DISCARD_CNT_WIDTH-1:0]	inputDiscardCnt,	//Number of input pixels to discard before processing data. Used for clipping
+input wire [INPUT_X_RES_WIDTH-1:0]	inputXRes,			//Resolution of input data minus 1
+input wire [INPUT_Y_RES_WIDTH-1:0]	inputYRes,
+input wire [OUTPUT_X_RES_WIDTH-1:0]	outputXRes,			//Resolution of output data minus 1
+input wire [OUTPUT_Y_RES_WIDTH-1:0]	outputYRes,
+input wire [SCALE_BITS-1:0]			xScale,				//Scaling factors. Input resolution scaled up by 1/xScale. Format Q SCALE_INT_BITS.SCALE_FRAC_BITS
+input wire [SCALE_BITS-1:0]			yScale,				//Scaling factors. Input resolution scaled up by 1/yScale. Format Q SCALE_INT_BITS.SCALE_FRAC_BITS
+
+input wire [OUTPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:0]
+									leftOffset,			//Integer/fraction of input pixel to offset output data horizontally right. Format Q OUTPUT_X_RES_WIDTH.SCALE_FRAC_BITS
+input wire [SCALE_FRAC_BITS-1:0]	topFracOffset,		//Fraction of input pixel to offset data vertically down. Format Q0.SCALE_FRAC_BITS
+input wire							nearestNeighbor		//Use nearest neighbor resize instead of bilinear
+);
+//-----------------------Internal signals and registers------------------------
+reg								advanceRead1;
+reg								advanceRead2;
+
+wire [DATA_WIDTH*CHANNELS-1:0]	readData00;
+wire [DATA_WIDTH*CHANNELS-1:0]	readData01;
+wire [DATA_WIDTH*CHANNELS-1:0]	readData10;
+wire [DATA_WIDTH*CHANNELS-1:0]	readData11;
+reg [DATA_WIDTH*CHANNELS-1:0]	readData00Reg;
+reg [DATA_WIDTH*CHANNELS-1:0]	readData01Reg;
+reg [DATA_WIDTH*CHANNELS-1:0]	readData10Reg;
+reg [DATA_WIDTH*CHANNELS-1:0]	readData11Reg;
+
+wire [INPUT_X_RES_WIDTH-1:0]	readAddress;
+
+reg 							readyForRead;		//Indicates two full lines have been put into the buffer
+reg [OUTPUT_Y_RES_WIDTH-1:0]	outputLine;			//which output video line we're on
+reg [OUTPUT_X_RES_WIDTH-1:0]	outputColumn;		//which output video column we're on
+reg [INPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:0]
+								xScaleAmount;		//Fractional and integer components of input pixel select (multiply result)
+reg [INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:0]
+								yScaleAmount;		//Fractional and integer components of input pixel select (multiply result)
+reg [INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:0]
+								yScaleAmountNext;	//Fractional and integer components of input pixel select (multiply result)
+wire [BUFFER_SIZE_WIDTH-1:0] 	fillCount;			//Numbers used rams in the ram fifo
+reg                 			lineSwitchOutputDisable; //On the end of an output line, disable the output for one cycle to let the RAM data become valid
+reg								dOutValidInt;
+
+reg [COEFF_WIDTH-1:0]			xBlend;
+wire [COEFF_WIDTH-1:0]			yBlend = {1'b0, yScaleAmount[SCALE_FRAC_BITS-1:SCALE_FRAC_BITS-FRACTION_BITS]};
+
+wire [INPUT_X_RES_WIDTH-1:0]	xPixLow = xScaleAmount[INPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];
+wire [INPUT_Y_RES_WIDTH-1:0]	yPixLow = yScaleAmount[INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];
+wire [INPUT_Y_RES_WIDTH-1:0]	yPixLowNext = yScaleAmountNext[INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];
+
+wire 							allDataWritten;		//Indicates that all data from input has been read in
+reg 							readState;
+
+//States for read state machine
+parameter RS_START = 0;
+parameter RS_READ_LINE = 1;
+
+//Read state machine
+//Controls the RFIFO(ram FIFO) readout and generates output data valid signals
+always @ (posedge clk or posedge rst or posedge start)
+begin
+	if(rst | start)
+	begin
+		outputLine <= 0;
+		outputColumn <= 0;
+		xScaleAmount <= 0;
+		yScaleAmount <= 0;
+		readState <= RS_START;
+		dOutValidInt <= 0;
+		lineSwitchOutputDisable <= 0;
+		advanceRead1 <= 0;
+		advanceRead2 <= 0;
+		yScaleAmountNext <= 0;
+	end
+	else
+	begin
+		case (readState)
+		  
+			RS_START:
+			begin
+				xScaleAmount <= leftOffset;
+				yScaleAmount <= {{INPUT_Y_RES_WIDTH{1'b0}}, topFracOffset};
+				if(readyForRead)
+				begin
+					readState <= RS_READ_LINE;
+					dOutValidInt <= 1;
+				end
+			end
+
+			RS_READ_LINE:
+			begin
+			
+				//outputLine goes through all output lines, and the logic determines which input lines to read into the RRB and which ones to discard.
+				if(nextDout && dOutValidInt)
+				begin
+					if(outputColumn == outputXRes)
+					begin //On the last input pixel of the line
+						if(yPixLowNext == (yPixLow + 1))    //If the next input line is only one greater, advance the RRB by one only
+						begin
+							advanceRead1 <= 1;
+							if(fillCount < 3)		//If the RRB doesn't have enough data, stop reading it out
+								dOutValidInt <= 0;
+						end
+						else if(yPixLowNext > (yPixLow + 1))  //If the next input line is two or more greater, advance the read by two
+						begin
+							advanceRead2 <= 1;
+							if(fillCount < 4)		//If the RRB doesn't have enough data, stop reading it out
+								dOutValidInt <= 0;
+						end
+					
+						outputColumn <= 0;
+						xScaleAmount <= leftOffset;
+						outputLine <= outputLine + 1;
+						yScaleAmount <= yScaleAmountNext;
+						lineSwitchOutputDisable <= 1;
+					end
+					else
+					begin
+						//Advance the output pixel selection values except when waiting for the ram data to become valid
+						if(lineSwitchOutputDisable == 0)
+						begin
+							outputColumn <= outputColumn + 1;
+							xScaleAmount <= (outputColumn + 1) * xScale + leftOffset;
+						end
+						advanceRead1 <= 0;
+						advanceRead2 <= 0;
+						lineSwitchOutputDisable <= 0;
+					end
+				end
+				else //else from if(nextDout && dOutValidInt)
+				begin
+					advanceRead1 <= 0;
+					advanceRead2 <= 0;
+					lineSwitchOutputDisable <= 0;
+				end
+				
+				//Once the RRB has enough data, let data be read from it. If all input data has been written, always allow read
+				if(fillCount >= 2 && dOutValidInt == 0 || allDataWritten)
+				begin
+					if((!advanceRead1 && !advanceRead2))
+					begin
+						dOutValidInt <= 1;
+						lineSwitchOutputDisable <= 0;
+					end
+				end
+			end//state RS_READ_LINE:
+		endcase
+		
+		//yScaleAmountNext is used to determine which input lines are valid.
+		yScaleAmountNext <= (outputLine + 1) * yScale + {{OUTPUT_Y_RES_WIDTH{1'b0}}, topFracOffset};
+	end
+end
+
+assign readAddress = xPixLow;
+
+//Generate dOutValid signal, delayed to account for delays in data path
+reg dOutValid_1;
+reg dOutValid_2;
+reg dOutValid_3;
+
+always @(posedge clk or posedge rst)
+begin
+	if(rst)
+	begin
+		dOutValid_1 <= 0;
+		dOutValid_2 <= 0;
+		dOutValid_3 <= 0;
+		dOutValid <= 0;
+	end
+	else
+	begin
+		dOutValid_1 <= nextDout && dOutValidInt && !lineSwitchOutputDisable;
+		dOutValid_2 <= dOutValid_1;
+		dOutValid_3 <= dOutValid_2;
+		dOutValid <= dOutValid_3;
+	end
+end
+
+//-----------------------Output data generation-----------------------------
+//Scale amount values are used to generate coefficients for the four pixels coming out of the RRB to be multiplied with.
+
+//Coefficients for each of the four pixels
+//Format Q1.FRACTION_BITS
+//			   yx
+reg [COEFF_WIDTH-1:0] 	coeff00;		//Top left
+reg [COEFF_WIDTH-1:0] 	coeff01;		//Top right
+reg [COEFF_WIDTH-1:0]	coeff10;		//Bottom left
+reg [COEFF_WIDTH-1:0]	coeff11;		//Bottom right
+
+//Coefficient value of one, format Q1.COEFF_WIDTH-1
+wire [COEFF_WIDTH-1:0]	coeffOne = {1'b1, {(COEFF_WIDTH-1){1'b0}}};	//One in MSb, zeros elsewhere
+//Coefficient value of one half, format Q1.COEFF_WIDTH-1
+wire [COEFF_WIDTH-1:0]	coeffHalf = {2'b01, {(COEFF_WIDTH-2){1'b0}}};
+
+//Compute bilinear interpolation coefficinets. Done here because these pre-registerd values are used twice.
+//Adding coeffHalf to get the nearest value.
+wire [COEFF_WIDTH-1:0]	preCoeff00 = (((coeffOne - xBlend) * (coeffOne - yBlend) + (coeffHalf - 1)) >> FRACTION_BITS) & 	{{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
+wire [COEFF_WIDTH-1:0]	preCoeff01 = ((xBlend * (coeffOne - yBlend) + (coeffHalf - 1)) >> FRACTION_BITS) & 				{{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
+wire [COEFF_WIDTH-1:0]	preCoeff10 = (((coeffOne - xBlend) * yBlend + (coeffHalf - 1)) >> FRACTION_BITS) &				{{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
+
+//Compute the coefficients
+always @(posedge clk or posedge rst)
+begin
+	if(rst)
+	begin
+		coeff00 <= 0;
+		coeff01 <= 0;
+		coeff10 <= 0;
+		coeff11 <= 0;
+		xBlend <= 0;
+	end
+	else
+	begin
+		xBlend <= {1'b0, xScaleAmount[SCALE_FRAC_BITS-1:SCALE_FRAC_BITS-FRACTION_BITS]};	//Changed to registered to improve timing
+		
+		if(nearestNeighbor == 1'b0)
+		begin
+			//Normal bilinear interpolation
+			coeff00 <= preCoeff00;
+			coeff01 <= preCoeff01;
+			coeff10 <= preCoeff10;
+			coeff11 <= ((xBlend * yBlend + (coeffHalf - 1)) >> FRACTION_BITS) &								{{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
+			//coeff11 <= coeffOne - preCoeff00 - preCoeff01 - preCoeff10;		//Guarantee that all coefficients sum to coeffOne. Saves a multiply too. Reverted to previous method due to timing issues.
+		end
+		else
+		begin
+			//Nearest neighbor interploation, set one coefficient to 1.0, the rest to zero based on the fractions
+			coeff00 <= xBlend < coeffHalf && yBlend < coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
+			coeff01 <= xBlend >= coeffHalf && yBlend < coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
+			coeff10 <= xBlend < coeffHalf && yBlend >= coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
+			coeff11 <= xBlend >= coeffHalf && yBlend >= coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
+		end
+	end
+end
+
+
+//Generate the blending multipliers
+reg [(DATA_WIDTH+COEFF_WIDTH)*CHANNELS-1:0]	product00, product01, product10, product11;
+
+generate
+genvar channel;
+	for(channel = 0; channel < CHANNELS; channel = channel + 1)
+		begin : blend_mult_generate
+			always @(posedge clk or posedge rst)
+			begin
+				if(rst)
+				begin
+					//productxx[channel] <= 0;
+					product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
+					product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
+					product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
+					product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
+					
+					//readDataxxReg[channel] <= 0;
+					readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
+					readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
+					readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
+					readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
+					
+					//dOut[channel] <= 0;
+					dOut[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
+				end
+				else
+				begin
+					//readDataxxReg[channel] <= readDataxx[channel];
+					readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData00[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
+					readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData01[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
+					readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData10[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
+					readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData11[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
+				
+					//productxx[channel] <= readDataxxReg[channel] * coeffxx
+					product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff00;
+					product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff01;
+					product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff10;
+					product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff11;
+					
+					//dOut[channel] <= (((product00[channel]) + 
+					//					(product01[channel]) +
+					//					(product10[channel]) +
+					//					(product11[channel])) >> FRACTION_BITS) & ({ {COEFF_WIDTH{1'b0}}, {DATA_WIDTH{1'b1}} });
+					dOut[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <=
+							(((product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) + 
+							(product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) +
+							(product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) +
+							(product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel])) >> FRACTION_BITS) & ({ {COEFF_WIDTH{1'b0}}, {DATA_WIDTH{1'b1}} });
+				end
+			end
+		end
+endgenerate
+
+				
+//---------------------------Data write logic----------------------------------
+//Places input data into the correct ram in the RFIFO (ram FIFO)
+//Controls writing to the RFIFO, and discards lines that arn't used
+
+reg [INPUT_Y_RES_WIDTH-1:0]		writeNextValidLine;	//Which line greater than writeRowCount is the next one that must be read in
+reg [INPUT_Y_RES_WIDTH-1:0]		writeNextPlusOne;	//One greater than writeNextValidLine, because we must always read in two adjacent lines
+reg [INPUT_Y_RES_WIDTH-1:0]		writeRowCount;		//Which line we're reading from dIn
+reg [OUTPUT_Y_RES_WIDTH-1:0]	writeOutputLine;	//The output line that corresponds to the input line. This is incremented until writeNextValidLine is greater than writeRowCount
+reg								getNextPlusOne;		//Flag so that writeNextPlusOne is captured only once after writeRowCount >= writeNextValidLine. This is in case multiple cycles are requred until writeNextValidLine changes.
+
+//Determine which lines to read out and which to discard.
+//writeNextValidLine is the next valid line number that needs to be read out above current value writeRowCount
+//writeNextPlusOne also needs to be read out (to do interpolation), this may or may not be equal to writeNextValidLine
+always @(posedge clk or posedge rst or posedge start)
+begin
+	if(rst | start)
+	begin
+		writeOutputLine <= 0;
+		writeNextValidLine <= 0;
+		writeNextPlusOne <= 1;
+		getNextPlusOne <= 1;
+	end
+	else
+	begin
+		if(writeRowCount >= writeNextValidLine) //When writeRowCount becomes higher than the next valid line to read out, comptue the next valid line.
+		begin
+			if(getNextPlusOne)			//Keep writeNextPlusOne
+			begin
+				writeNextPlusOne <= writeNextValidLine + 1;
+			end
+			getNextPlusOne <= 0;
+			writeOutputLine <= writeOutputLine + 1;
+			writeNextValidLine <= ((writeOutputLine*yScale + {{(OUTPUT_Y_RES_WIDTH + SCALE_INT_BITS){1'b0}}, topFracOffset}) >> SCALE_FRAC_BITS) & {{SCALE_BITS{1'b0}}, {OUTPUT_Y_RES_WIDTH{1'b1}}};
+		end
+		else
+		begin
+			getNextPlusOne <= 1;
+		end
+	end
+end
+
+reg			discardInput;
+reg [DISCARD_CNT_WIDTH-1:0] discardCountReg;
+wire		advanceWrite;
+
+reg [1:0]	writeState;
+
+reg [INPUT_X_RES_WIDTH-1:0] writeColCount;
+reg			enableNextDin;
+reg			forceRead;
+
+//Write state machine
+//Controls writing scaler input data into the RRB
+
+parameter	WS_START = 0;
+parameter	WS_DISCARD = 1;
+parameter	WS_READ = 2;
+parameter	WS_DONE = 3;
+
+//Control write and address signals to write data into ram FIFO
+always @ (posedge clk or posedge rst or posedge start)
+begin
+	if(rst | start)
+	begin
+		writeState <= WS_START;
+		enableNextDin <= 0;
+		discardInput <= 0;
+		readyForRead <= 0;
+		writeRowCount <= 0;
+		writeColCount <= 0;
+		discardCountReg <= 0;
+		forceRead <= 0;
+	end
+	else
+	begin
+		case (writeState)
+		
+			WS_START:
+			begin
+				discardCountReg <= inputDiscardCnt;
+				if(inputDiscardCnt > 0)
+				begin
+					discardInput <= 1;
+					enableNextDin <= 1;
+					writeState <= WS_DISCARD;
+				end
+				else
+				begin
+					discardInput <= 0;
+					enableNextDin <= 1;
+					writeState <= WS_READ;
+				end
+				discardInput <= (inputDiscardCnt > 0) ? 1'b1 : 1'b0;
+			end
+			
+			WS_DISCARD:	//Discard pixels from input data
+			begin
+				if(dInValid)
+				begin
+					discardCountReg <= discardCountReg - 1;
+					if((discardCountReg - 1) == 0)
+					begin
+						discardInput <= 0;
+						writeState <= WS_READ;
+					end
+				end
+			end
+			
+			WS_READ:
+			begin
+				if(dInValid & nextDin)
+				begin
+					if(writeColCount == inputXRes)
+					begin	//Occurs on the last pixel in the line
+						if((writeNextValidLine == writeRowCount + 1) ||
+							(writeNextPlusOne == writeRowCount + 1))
+						begin //Next line is valid, write into buffer
+							discardInput <= 0;
+						end
+						else
+						begin	//Next line is not valid, discard
+							discardInput <= 1;
+						end
+						
+						//Once writeRowCount is >= 2, data is ready to start being output.
+						if(writeRowCount[1])
+							readyForRead <= 1;
+						
+						if(writeRowCount == inputYRes)	//When all data has been read in, stop reading.
+						begin
+							writeState <= WS_DONE;
+							enableNextDin <= 0;
+							forceRead <= 1;
+						end
+						
+						writeColCount <= 0;
+						writeRowCount <= writeRowCount + 1;
+					end
+					else
+					begin
+						writeColCount <= writeColCount + 1;
+					end
+				end
+			end
+			
+			WS_DONE:
+			begin
+				//do nothing, wait for reset
+			end
+			
+		endcase
+	end
+end
+
+
+//Advance write whenever we have just written a valid line (discardInput == 0)
+//Generate this signal one earlier than discardInput above that uses the same conditions, to advance the buffer at the right time.
+assign advanceWrite =	(writeColCount == inputXRes) & (discardInput == 0) & dInValid & nextDin;
+assign allDataWritten = writeState == WS_DONE;
+assign nextDin = (fillCount < BUFFER_SIZE) & enableNextDin;
+
+ramFifo #(
+	.DATA_WIDTH( DATA_WIDTH*CHANNELS ),
+	.ADDRESS_WIDTH( INPUT_X_RES_WIDTH ),	//Controls width of RAMs
+	.BUFFER_SIZE( BUFFER_SIZE )		//Number of RAMs
+) ramRB (
+	.clk( clk ),
+	.rst( rst | start ),
+	.advanceRead1( advanceRead1 ),
+	.advanceRead2( advanceRead2 ),
+	.advanceWrite( advanceWrite ),
+	.forceRead( forceRead ),
+
+	.writeData( dIn ),		
+	.writeAddress( writeColCount ),
+	.writeEnable( dInValid & nextDin & enableNextDin & ~discardInput ),
+	.fillCount( fillCount ),
+	
+	.readData00( readData00 ),
+	.readData01( readData01 ),
+	.readData10( readData10 ),
+	.readData11( readData11 ),
+	.readAddress( readAddress )
+);
+
+endmodule	//scaler
+
+
+
+//---------------------------Ram FIFO (RFIFO)-----------------------------
+//FIFO buffer with rams as the elements, instead of data
+//One ram is filled, while two others are simultaneously read out.
+//Four neighboring pixels are read out at once, at the selected RAM and one line down, and at readAddress and readAddress + 1
+module ramFifo #(
+	parameter DATA_WIDTH = 8,
+	parameter ADDRESS_WIDTH = 8,
+	parameter BUFFER_SIZE = 2,
+	parameter BUFFER_SIZE_WIDTH =	((BUFFER_SIZE+1) <= 2) ? 1 :	//wide enough to hold value BUFFER_SIZE + 1
+									((BUFFER_SIZE+1) <= 4) ? 2 :
+									((BUFFER_SIZE+1) <= 8) ? 3 :
+									((BUFFER_SIZE+1) <= 16) ? 4 :
+									((BUFFER_SIZE+1) <= 32) ? 5 :
+									((BUFFER_SIZE+1) <= 64) ? 6 : 7
+)(
+	input wire 						clk,
+	input wire 						rst,
+	input wire						advanceRead1,	//Advance selected read RAM by one
+	input wire						advanceRead2,	//Advance selected read RAM by two
+	input wire						advanceWrite,	//Advance selected write RAM by one
+	input wire						forceRead,		//Disables writing to allow all data to be read out (RAM being written to cannot be read from normally)		
+
+	input wire [DATA_WIDTH-1:0]		writeData,
+	input wire [ADDRESS_WIDTH-1:0]	writeAddress,
+	input wire						writeEnable,
+	output reg [BUFFER_SIZE_WIDTH-1:0]
+									fillCount,
+
+	//										yx
+	output wire [DATA_WIDTH-1:0]	readData00,		//Read from deepest RAM (earliest data), at readAddress
+	output wire [DATA_WIDTH-1:0]	readData01,		//Read from deepest RAM (earliest data), at readAddress + 1
+	output wire [DATA_WIDTH-1:0]	readData10,		//Read from second deepest RAM (second earliest data), at readAddress
+	output wire [DATA_WIDTH-1:0]	readData11,		//Read from second deepest RAM (second earliest data), at readAddress + 1
+	input wire [ADDRESS_WIDTH-1:0]	readAddress
+);
+
+reg [BUFFER_SIZE-1:0]		writeSelect;
+reg [BUFFER_SIZE-1:0]		readSelect;
+
+//Read select ring register
+always @(posedge clk or posedge rst)
+begin
+	if(rst)
+		readSelect <= 1;
+	else
+	begin
+		if(advanceRead1)
+		begin
+			readSelect <= {readSelect[BUFFER_SIZE-2 : 0], readSelect[BUFFER_SIZE-1]};
+		end
+		else if(advanceRead2)
+		begin
+			readSelect <= {readSelect[BUFFER_SIZE-3 : 0], readSelect[BUFFER_SIZE-1:BUFFER_SIZE-2]};
+		end
+	end
+end
+
+//Write select ring register
+always @(posedge clk or posedge rst)
+begin
+	if(rst)
+		writeSelect <= 1;
+	else
+	begin
+		if(advanceWrite)
+		begin
+			writeSelect <= {writeSelect[BUFFER_SIZE-2 : 0], writeSelect[BUFFER_SIZE-1]};
+		end
+	end
+end
+
+wire [DATA_WIDTH-1:0] ramDataOutA [2**BUFFER_SIZE-1:0];
+wire [DATA_WIDTH-1:0] ramDataOutB [2**BUFFER_SIZE-1:0];
+
+//Generate to instantiate the RAMs
+generate
+genvar i;
+	for(i = 0; i < BUFFER_SIZE; i = i + 1)
+		begin : ram_generate
+
+			ramDualPort #(
+				.DATA_WIDTH( DATA_WIDTH ),
+				.ADDRESS_WIDTH( ADDRESS_WIDTH )
+			) ram_inst_i(
+				.clk( clk ),
+				
+				//Port A is written to as well as read from. When writing, this port cannot be read from.
+				//As long as the buffer is large enough, this will not cause any problem.
+				.addrA( ((writeSelect[i] == 1'b1) && !forceRead && writeEnable) ? writeAddress : readAddress ),	//&& writeEnable is 
+				//to allow the full buffer to be used. After the buffer is filled, write is advanced, so writeSelect
+				//and readSelect are the same. The full buffer isn't written to, so this allows the read to work properly.
+				.dataA( writeData ),													
+				.weA( ((writeSelect[i] == 1'b1) && !forceRead) ? writeEnable : 1'b0 ),
+				.qA( ramDataOutA[2**i] ),
+				
+				.addrB( readAddress + 1 ),
+				.dataB( 0 ),
+				.weB( 1'b0 ),
+				.qB( ramDataOutB[2**i] )
+			);
+		end
+endgenerate
+
+//Select which ram to read from
+wire [BUFFER_SIZE-1:0]	readSelect0 = readSelect;
+wire [BUFFER_SIZE-1:0]	readSelect1 = (readSelect << 1) | readSelect[BUFFER_SIZE-1];
+
+//Steer the output data to the right ports
+assign readData00 = ramDataOutA[readSelect0];
+assign readData10 = ramDataOutA[readSelect1];
+assign readData01 = ramDataOutB[readSelect0];
+assign readData11 = ramDataOutB[readSelect1];
+
+//Keep track of fill level
+always @(posedge clk or posedge rst)
+begin
+	if(rst)
+	begin
+		fillCount <= 0;
+	end
+	else
+	begin
+		if(advanceWrite)
+		begin
+			if(advanceRead1)
+				fillCount <= fillCount;
+			else if(advanceRead2)
+				fillCount <= fillCount - 1;
+			else
+				fillCount <= fillCount + 1;
+		end
+		else
+		begin
+			if(advanceRead1)
+				fillCount <= fillCount - 1;
+			else if(advanceRead2)
+				fillCount <= fillCount - 2;
+			else
+				fillCount <= fillCount;
+		end
+	end
+end
+
+endmodule //ramFifo
+
+
+//Dual port RAM
+module ramDualPort #(
+	parameter DATA_WIDTH = 8,
+	parameter ADDRESS_WIDTH = 8
+)(
+	input wire [(DATA_WIDTH-1):0] dataA, dataB,
+	input wire [(ADDRESS_WIDTH-1):0] addrA, addrB,
+	input wire weA, weB, clk,
+	output reg [(DATA_WIDTH-1):0] qA, qB
+);
+
+	// Declare the RAM variable
+	reg [DATA_WIDTH-1:0] ram[2**ADDRESS_WIDTH-1:0];
+
+	//Port A
+	always @ (posedge clk)
+	begin
+		if (weA) 
+		begin
+			ram[addrA] <= dataA;
+			qA <= dataA;
+		end
+		else 
+		begin
+			qA <= ram[addrA];
+		end 
+	end 
+
+	//Port B
+	always @ (posedge clk)
+	begin
+		if (weB) 
+		begin
+			ram[addrB] <= dataB;
+			qB <= dataB;
+		end
+		else 
+		begin
+			qB <= ram[addrB];
+		end 
+	end
+
+endmodule //ramDualPort
--- a/Scaler/sim/out/output1280x1024to640x512.raw
+++ b/Scaler/sim/out/output1280x1024to640x512.raw
--- a/Scaler/sim/out/output640x512to1280x1024.raw
+++ b/Scaler/sim/out/output640x512to1280x1024.raw
--- a/Scaler/sim/out/output640x512to640x512.raw
+++ b/Scaler/sim/out/output640x512to640x512.raw
--- a/Scaler/sim/scaler_tb.v
+++ b/Scaler/sim/scaler_tb.v
@@ -0,0 +1,437 @@
+/*-----------------------------------------------------------------------------
+
+								Video Stream Scaler testbench
+								
+							Author: David Kronstein
+							
+
+
+Copyright 2011, David Kronstein, and individual contributors as indicated
+by the @authors tag.
+
+This is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as
+published by the Free Software Foundation; either version 2.1 of
+the License, or (at your option) any later version.
+
+This software is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this software; if not, write to the Free
+Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+02110-1301 USA, or see the FSF site: http://www.fsf.org.
+
+-------------------------------------------------------------------------------
+
+Testbench for streamScaler V1.0.0
+
+*/
+
+`default_nettype none
+
+//Input files. Raw data format, no header. 8 bits per pixel, 3 color channels.
+`define INPUT640x512			"src/input640x512RGB.raw"
+`define INPUT1280x1024			"src/input1280x1024RGB.raw"
+`define INPUT1280x1024_21EXTRA	"src/input640x512_21extraRGB.raw"	//21 extra pixels at the start to be discarded
+
+module scalerTestbench;
+parameter BUFFER_SIZE = 4;
+
+wire [7-1:0] done;
+
+//640x512 to 1280x1024
+	scalerTest #(
+	.INPUT_X_RES ( 640-1 ),
+	.INPUT_Y_RES ( 512-1 ),
+	.OUTPUT_X_RES ( 1280-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 1024-1 ),   //Output resolution - 1
+	//.X_SCALE ( X_SCALE ),
+	//.Y_SCALE ( Y_SCALE ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_640x512to1280x1024 (
+	.inputFilename( `INPUT640x512 ),
+	.outputFilename( "out/output640x512to1280x1024.raw" ),
+
+	//Control
+	.inputDiscardCnt( 0 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 0 ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+	.done ( done[0] )
+	);
+
+
+//640x512 to 640x512
+	scalerTest #(
+	.INPUT_X_RES ( 640-1 ),
+	.INPUT_Y_RES ( 512-1 ),
+	.OUTPUT_X_RES ( 640-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 512-1 ),   //Output resolution - 1
+	.X_SCALE ( 32'h4000 ),
+	.Y_SCALE ( 32'h4000 ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_640x512to640x512 (
+	.inputFilename( `INPUT640x512 ),
+	.outputFilename( "out/output640x512to640x512.raw" ),
+
+	//Control
+	.inputDiscardCnt( 0 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 0 ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+	.done ( done[1] )
+	);
+	
+
+//1280x1024 to 960x768
+	scalerTest #(
+	.INPUT_X_RES ( 1280-1 ),
+	.INPUT_Y_RES ( 1024-1 ),
+	.OUTPUT_X_RES ( 960-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 768-1 ),   //Output resolution - 1
+	//.X_SCALE ( X_SCALE ),
+	//.Y_SCALE ( Y_SCALE ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_1280x1024to960x768 (
+	.inputFilename( `INPUT1280x1024 ),
+	.outputFilename( "out/output1280x1024to960x768.raw" ),
+
+	//Control
+	.inputDiscardCnt( 0 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 0 ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+  	.done ( done[2] )
+	);
+
+
+//1280x1024 to 640x512
+	scalerTest #(
+	.INPUT_X_RES ( 1280-1 ),
+	.INPUT_Y_RES ( 1024-1 ),
+	.OUTPUT_X_RES ( 640-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 512-1 ),   //Output resolution - 1
+	.X_SCALE ( 32'h4000*2 ),
+	.Y_SCALE ( 32'h4000*2 ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_1280x1024to640x512 (
+	.inputFilename( `INPUT1280x1024 ),
+	.outputFilename( "out/output1280x1024to640x512.raw" ),
+
+	//Control
+	.inputDiscardCnt( 0 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 25'h1FFF ),
+	.topFracOffset( 25'h1FFF ),
+	.nearestNeighbor( 0 ),
+	.done ( done[3] )
+	);
+
+//1280x1024 to 480x384
+
+	scalerTest #(
+	.INPUT_X_RES ( 1280-1 ),
+	.INPUT_Y_RES ( 1024-1 ),
+	.OUTPUT_X_RES ( 480-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 384-1 ),   //Output resolution - 1
+	//.X_SCALE ( 32'h4000*2 ),
+	//.Y_SCALE ( 32'h4000*2 ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_1280x1024to480x384 (
+	.inputFilename( `INPUT1280x1024 ),
+	.outputFilename( "out/output1280x1024to480x384.raw" ),
+
+	//Control
+	.inputDiscardCnt( 0 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 0 ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+	.done ( done[4] )
+		);
+	
+//640x512 to 1280x1024, discarding 21
+
+	scalerTest #(
+	.INPUT_X_RES ( 640-1 ),
+	.INPUT_Y_RES ( 512-1 ),
+	.OUTPUT_X_RES ( 1280-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 1024-1 ),   //Output resolution - 1
+	//.X_SCALE ( 32'h4000*2 ),
+	//.Y_SCALE ( 32'h4000*2 ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 8 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_640x512to1280x1024_21extra (
+	.inputFilename( `INPUT1280x1024_21EXTRA ),
+	.outputFilename( "out/output640x512to1280x1024_21extra.raw" ),
+
+	//Control
+	.inputDiscardCnt( 21 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( 0 ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+	.done ( done[5] )
+		);
+	
+//640x512 to 1280x1024, discarding 21
+
+	scalerTest #(
+	.INPUT_X_RES ( 640-1 ),
+	.INPUT_Y_RES ( 40-1 ),
+	.OUTPUT_X_RES ( 640-1 ),   //Output resolution - 1
+	.OUTPUT_Y_RES ( 512-1 ),   //Output resolution - 1
+	.X_SCALE ( 32'h4000 * (50-1) / (640-1)-1 ),
+	.Y_SCALE ( 32'h4000 * (40-1) / (512-1)-1 ),
+
+	.DATA_WIDTH ( 8 ),
+	.DISCARD_CNT_WIDTH ( 14 ),
+	.INPUT_X_RES_WIDTH ( 11 ),
+	.INPUT_Y_RES_WIDTH ( 11 ),
+	.OUTPUT_X_RES_WIDTH ( 11 ),
+	.OUTPUT_Y_RES_WIDTH ( 11 ),
+	.BUFFER_SIZE ( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+	) st_50x40to640x512clipped (
+	.inputFilename( `INPUT640x512 ),
+	.outputFilename( "out/output50x40to640x512clipped.raw" ),
+
+	//Control
+	.inputDiscardCnt( 640*3 ),		//Number of input pixels to discard before processing data. Used for clipping
+	.leftOffset( {11'd249, 14'b0} ),
+	.topFracOffset( 0 ),
+	.nearestNeighbor( 0 ),
+	.done ( done[6] )
+		);
+	
+	initial
+	begin
+	  #10
+	  while(done != 7'b1111111)
+	   #10
+	   ;
+		$stop;
+	end
+  
+
+
+
+endmodule
+
+module scalerTest #(
+parameter INPUT_X_RES = 120-1,
+parameter INPUT_Y_RES = 90-1,
+parameter OUTPUT_X_RES = 1280-1,   //Output resolution - 1
+parameter OUTPUT_Y_RES = 960-1,   //Output resolution - 1
+parameter X_SCALE = 32'h4000 * (INPUT_X_RES) / (OUTPUT_X_RES)-1,
+parameter Y_SCALE = 32'h4000 * (INPUT_Y_RES) / (OUTPUT_Y_RES)-1,
+
+parameter DATA_WIDTH = 8,
+parameter CHANNELS = 3,
+parameter DISCARD_CNT_WIDTH = 8,
+parameter INPUT_X_RES_WIDTH = 11,
+parameter INPUT_Y_RES_WIDTH = 11,
+parameter OUTPUT_X_RES_WIDTH = 11,
+parameter OUTPUT_Y_RES_WIDTH = 11,
+parameter BUFFER_SIZE = 6				//Number of RAMs in RAM ring buffer
+)(
+input wire [50*8:0] inputFilename, outputFilename,
+
+//Control
+input wire [DISCARD_CNT_WIDTH-1:0]	inputDiscardCnt,		//Number of input pixels to discard before processing data. Used for clipping
+input wire [INPUT_X_RES_WIDTH+14-1:0] leftOffset,
+input wire [14-1:0]	topFracOffset,
+input wire nearestNeighbor,
+
+output reg done
+
+);
+
+
+reg clk;
+reg rst;
+
+
+reg [DATA_WIDTH*CHANNELS-1:0] dIn;
+reg		dInValid;
+wire	nextDin;
+reg		start;
+
+wire [DATA_WIDTH*CHANNELS-1:0] dOut;
+wire	dOutValid;
+reg		nextDout;
+
+integer r, rfile, wfile;
+
+initial // Clock generator
+  begin
+    #10 //Delay to allow filename to get here
+    clk = 0;
+    #5 forever #5 clk = !clk;
+  end
+
+initial	// Reset
+  begin
+	done = 0;
+    #10 //Delay to allow filename to get here
+    rst = 0;
+    #5 rst = 1;
+    #4 rst = 0;
+   // #50000 $stop;
+  end
+
+reg eof;
+reg [DATA_WIDTH*CHANNELS-1:0] readMem [0:0];
+initial // Input file read, generates dIn data
+begin
+  #10 //Delay to allow filename to get here
+	rfile = $fopen(inputFilename, "rb");
+	
+	dIn = 0;
+	dInValid = 0;
+	start = 0;
+
+	#41
+	start = 1;
+
+	#10
+	start = 0;
+
+	#20
+	r = $fread(readMem, rfile);
+	dIn = readMem[0];
+	
+	while(! $feof(rfile))
+	begin
+		dInValid = 1;
+		
+		#10 
+		if(nextDin) 
+		begin
+			r = $fread(readMem, rfile);
+			dIn = readMem[0];
+		end
+	end
+
+  $fclose(rfile);
+end
+
+//Generate nextDout request signal
+initial
+begin
+  #10 //Delay to match filename arrival delay
+	nextDout = 0;
+	#140001
+	forever
+	begin
+		//This can be used to slow down the read to simulate live read-out. This basically inserts H blank periods.
+		#(10*(OUTPUT_X_RES+1)*4)
+		nextDout = 0;
+		#(10*(OUTPUT_X_RES+1))
+		nextDout = 1;
+		
+	end
+end
+
+//Read dOut and write to file
+integer dOutCount;
+initial
+begin
+  #10 //Delay to allow filename to get here
+	wfile = $fopen(outputFilename, "wb");
+	nextDout = 0;
+	dOutCount = 0;
+	#1
+	while(dOutCount < (OUTPUT_X_RES+1) * (OUTPUT_Y_RES+1))
+	begin
+		#10
+		if(dOutValid == 1)
+		begin
+			$fwrite(wfile, "%c", dOut[23:16]);
+			$fwrite(wfile, "%c", dOut[15:8]);
+			$fwrite(wfile, "%c", dOut[7:0]);
+			dOutCount = dOutCount + 1;
+		end
+	end
+	$fclose(wfile);
+	done = 1;
+end
+
+streamScaler #(
+.DATA_WIDTH( DATA_WIDTH ),
+.CHANNELS( CHANNELS ),
+.DISCARD_CNT_WIDTH( DISCARD_CNT_WIDTH ),
+.INPUT_X_RES_WIDTH( INPUT_X_RES_WIDTH ),
+.INPUT_Y_RES_WIDTH( INPUT_Y_RES_WIDTH ),
+.OUTPUT_X_RES_WIDTH( OUTPUT_X_RES_WIDTH ),
+.OUTPUT_Y_RES_WIDTH( OUTPUT_Y_RES_WIDTH ),
+.BUFFER_SIZE( BUFFER_SIZE )				//Number of RAMs in RAM ring buffer
+) scaler_inst (
+.clk( clk ),
+.rst( rst ),
+
+.dIn( dIn ),
+.dInValid( dInValid ),
+.nextDin( nextDin ),
+.start( start ),
+
+.dOut( dOut ),
+.dOutValid( dOutValid ),
+.nextDout( nextDout ),
+
+//Control
+.inputDiscardCnt( inputDiscardCnt ),		//Number of input pixels to discard before processing data. Used for clipping
+.inputXRes( INPUT_X_RES ),				//Input data number of pixels per line
+.inputYRes( INPUT_Y_RES ),
+
+.outputXRes( OUTPUT_X_RES ),				//Resolution of output data
+.outputYRes( OUTPUT_Y_RES ),
+.xScale( X_SCALE ),					//Scaling factors. Input resolution scaled by 1/xScale. Format Q4.14
+.yScale( Y_SCALE ),					//Scaling factors. Input resolution scaled by 1/yScale. Format Q4.14
+
+.leftOffset( leftOffset ),
+.topFracOffset( topFracOffset ),
+.nearestNeighbor( nearestNeighbor )
+);
+
+endmodule