// File listing metadata: Verilog, 761 lines, 26 KiB
/*-----------------------------------------------------------------------------

                            Video Stream Scaler

                          Author: David Kronstein



Copyright 2011, David Kronstein, and individual contributors as indicated
by the @authors tag.

This is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as
published by the Free Software Foundation; either version 2.1 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this software; if not, write to the Free
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
02110-1301 USA, or see the FSF site: http://www.fsf.org.

-------------------------------------------------------------------------------

Scales streaming video up or down in resolution. Bilinear and nearest neighbor
modes are supported.

Run-time adjustment of input and output resolution, scaling factors, and scale
type.

-------------------------------------------------------------------------------

Revisions

V1.0.0    Feb 21 2011    Initial Release    David Kronstein
Known bugs:
Very slight numerical errors (+0/-2 LSb) in output data due to coefficient arithmetic.
Impossible to notice without adjustment in video levels. Attempted to fix by setting
coeff11 to 1.0 - other coefficients, but this caused timing issues.

*/
`default_nettype none

//-----------------------------------------------------------------------------
// streamScaler
//
// Streaming video scaler. Input pixels arrive on dIn (handshake: dInValid /
// nextDin), are written line by line into a FIFO of line RAMs (ramFifo), and
// output pixels are produced on dOut (handshake: nextDout / dOutValid) by
// blending the four neighbouring input pixels read from two adjacent buffered
// lines. Coefficients are either bilinear weights or, when nearestNeighbor is
// set, a single coefficient of 1.0.
//
// Reset behaviour: rst and start are both used as asynchronous, active-high
// resets for the control logic (both appear in the sensitivity lists), and
// rst | start resets the line-buffer FIFO. Asserting start therefore restarts
// a frame.
//
// The module is organised as:
//   * read state machine      - walks the output pixel grid, advances the FIFO
//   * dOutValid delay line    - aligns the valid flag with the data pipeline
//   * coefficient generation  - bilinear / nearest-neighbour weights
//   * blend multipliers       - one set per channel
//   * write line selection    - decides which input lines are kept
//   * write state machine     - discards / stores incoming pixels
//-----------------------------------------------------------------------------
module streamScaler #(
//---------------------------Parameters----------------------------------------
parameter DATA_WIDTH         = 8,   //Width of input/output data
parameter CHANNELS           = 1,   //Number of channels of DATA_WIDTH, for color images
parameter DISCARD_CNT_WIDTH  = 8,   //Width of inputDiscardCnt
parameter INPUT_X_RES_WIDTH  = 11,  //Widths of input/output resolution control signals
parameter INPUT_Y_RES_WIDTH  = 11,
parameter OUTPUT_X_RES_WIDTH = 11,
parameter OUTPUT_Y_RES_WIDTH = 11,
parameter FRACTION_BITS      = 8,   //Number of bits for fractional component of coefficients.

parameter SCALE_INT_BITS     = 4,   //Width of integer component of scaling factor. The maximum input data width to
                                    //multipliers created will be SCALE_INT_BITS + SCALE_FRAC_BITS. Typically these
                                    //values will sum to 18 to match multipliers available in FPGAs.
parameter SCALE_FRAC_BITS    = 14,  //Width of fractional component of scaling factor
parameter BUFFER_SIZE        = 4,   //Depth of RFIFO
//---------------------Non-user-definable parameters----------------------------
parameter COEFF_WIDTH        = FRACTION_BITS + 1,
parameter SCALE_BITS         = SCALE_INT_BITS + SCALE_FRAC_BITS,
parameter BUFFER_SIZE_WIDTH  = ((BUFFER_SIZE+1) <= 2) ? 1 :   //wide enough to hold value BUFFER_SIZE + 1
                               ((BUFFER_SIZE+1) <= 4) ? 2 :
                               ((BUFFER_SIZE+1) <= 8) ? 3 :
                               ((BUFFER_SIZE+1) <= 16) ? 4 :
                               ((BUFFER_SIZE+1) <= 32) ? 5 :
                               ((BUFFER_SIZE+1) <= 64) ? 6 : 7
)(
//---------------------------Module IO-----------------------------------------
//Clock and reset
input wire                                  clk,
input wire                                  rst,

//User interface
//Input
input wire [DATA_WIDTH*CHANNELS-1:0]        dIn,
input wire                                  dInValid,
output wire                                 nextDin,
input wire                                  start,

//Output
output reg [DATA_WIDTH*CHANNELS-1:0]        dOut,
output reg                                  dOutValid,          //latency of 4 clock cycles after nextDout is asserted
input wire                                  nextDout,

//Control
input wire [DISCARD_CNT_WIDTH-1:0]          inputDiscardCnt,    //Number of input pixels to discard before processing data. Used for clipping
input wire [INPUT_X_RES_WIDTH-1:0]          inputXRes,          //Resolution of input data minus 1
input wire [INPUT_Y_RES_WIDTH-1:0]          inputYRes,
input wire [OUTPUT_X_RES_WIDTH-1:0]         outputXRes,         //Resolution of output data minus 1
input wire [OUTPUT_Y_RES_WIDTH-1:0]         outputYRes,
input wire [SCALE_BITS-1:0]                 xScale,             //Scaling factors. Input resolution scaled up by 1/xScale. Format Q SCALE_INT_BITS.SCALE_FRAC_BITS
input wire [SCALE_BITS-1:0]                 yScale,             //Scaling factors. Input resolution scaled up by 1/yScale. Format Q SCALE_INT_BITS.SCALE_FRAC_BITS

input wire [OUTPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:0]
                                            leftOffset,         //Integer/fraction of input pixel to offset output data horizontally right. Format Q OUTPUT_X_RES_WIDTH.SCALE_FRAC_BITS
input wire [SCALE_FRAC_BITS-1:0]            topFracOffset,      //Fraction of input pixel to offset data vertically down. Format Q0.SCALE_FRAC_BITS
input wire                                  nearestNeighbor     //Use nearest neighbor resize instead of bilinear
);

//-----------------------Internal signals and registers------------------------
reg                                 advanceRead1;
reg                                 advanceRead2;

wire [DATA_WIDTH*CHANNELS-1:0]      readData00;
wire [DATA_WIDTH*CHANNELS-1:0]      readData01;
wire [DATA_WIDTH*CHANNELS-1:0]      readData10;
wire [DATA_WIDTH*CHANNELS-1:0]      readData11;
reg [DATA_WIDTH*CHANNELS-1:0]       readData00Reg;
reg [DATA_WIDTH*CHANNELS-1:0]       readData01Reg;
reg [DATA_WIDTH*CHANNELS-1:0]       readData10Reg;
reg [DATA_WIDTH*CHANNELS-1:0]       readData11Reg;

wire [INPUT_X_RES_WIDTH-1:0]        readAddress;

reg                                 readyForRead;       //Indicates two full lines have been put into the buffer
reg [OUTPUT_Y_RES_WIDTH-1:0]        outputLine;         //which output video line we're on
reg [OUTPUT_X_RES_WIDTH-1:0]        outputColumn;       //which output video column we're on
reg [INPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:0]
                                    xScaleAmount;       //Fractional and integer components of input pixel select (multiply result)
reg [INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:0]
                                    yScaleAmount;       //Fractional and integer components of input pixel select (multiply result)
reg [INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:0]
                                    yScaleAmountNext;   //Fractional and integer components of input pixel select (multiply result)
wire [BUFFER_SIZE_WIDTH-1:0]        fillCount;          //Numbers used rams in the ram fifo
reg                                 lineSwitchOutputDisable; //On the end of an output line, disable the output for one cycle to let the RAM data become valid
reg                                 dOutValidInt;

reg [COEFF_WIDTH-1:0]               xBlend;
wire [COEFF_WIDTH-1:0]              yBlend = {1'b0, yScaleAmount[SCALE_FRAC_BITS-1:SCALE_FRAC_BITS-FRACTION_BITS]};

//Integer parts of the scaled coordinates: the input column / line being addressed
wire [INPUT_X_RES_WIDTH-1:0]        xPixLow = xScaleAmount[INPUT_X_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];
wire [INPUT_Y_RES_WIDTH-1:0]        yPixLow = yScaleAmount[INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];
wire [INPUT_Y_RES_WIDTH-1:0]        yPixLowNext = yScaleAmountNext[INPUT_Y_RES_WIDTH-1+SCALE_FRAC_BITS:SCALE_FRAC_BITS];

wire                                allDataWritten;     //Indicates that all data from input has been read in
reg                                 readState;

//States for read state machine
//(localparam: internal encodings, not meant to be overridden from outside)
localparam RS_START     = 0;
localparam RS_READ_LINE = 1;

//Read state machine
//Controls the RFIFO(ram FIFO) readout and generates output data valid signals
//rst and start both act as asynchronous resets of this logic.
always @ (posedge clk or posedge rst or posedge start)
begin
    if(rst | start)
    begin
        outputLine <= 0;
        outputColumn <= 0;
        xScaleAmount <= 0;
        yScaleAmount <= 0;
        readState <= RS_START;
        dOutValidInt <= 0;
        lineSwitchOutputDisable <= 0;
        advanceRead1 <= 0;
        advanceRead2 <= 0;
        yScaleAmountNext <= 0;
    end
    else
    begin
        case (readState)

            RS_START:
            begin
                //Load the initial offsets and wait until the write side reports enough buffered lines
                xScaleAmount <= leftOffset;
                yScaleAmount <= {{INPUT_Y_RES_WIDTH{1'b0}}, topFracOffset};
                if(readyForRead)
                begin
                    readState <= RS_READ_LINE;
                    dOutValidInt <= 1;
                end
            end

            RS_READ_LINE:
            begin

                //outputLine goes through all output lines, and the logic determines which input lines to read into the RRB and which ones to discard.
                if(nextDout && dOutValidInt)
                begin
                    if(outputColumn == outputXRes)
                    begin //On the last input pixel of the line
                        if(yPixLowNext == (yPixLow + 1))    //If the next input line is only one greater, advance the RRB by one only
                        begin
                            advanceRead1 <= 1;
                            if(fillCount < 3)       //If the RRB doesn't have enough data, stop reading it out
                                dOutValidInt <= 0;
                        end
                        else if(yPixLowNext > (yPixLow + 1))    //If the next input line is two or more greater, advance the read by two
                        begin
                            advanceRead2 <= 1;
                            if(fillCount < 4)       //If the RRB doesn't have enough data, stop reading it out
                                dOutValidInt <= 0;
                        end

                        outputColumn <= 0;
                        xScaleAmount <= leftOffset;
                        outputLine <= outputLine + 1;
                        yScaleAmount <= yScaleAmountNext;
                        lineSwitchOutputDisable <= 1;
                    end
                    else
                    begin
                        //Advance the output pixel selection values except when waiting for the ram data to become valid
                        if(lineSwitchOutputDisable == 0)
                        begin
                            outputColumn <= outputColumn + 1;
                            xScaleAmount <= (outputColumn + 1) * xScale + leftOffset;
                        end
                        advanceRead1 <= 0;
                        advanceRead2 <= 0;
                        lineSwitchOutputDisable <= 0;
                    end
                end
                else //else from if(nextDout && dOutValidInt)
                begin
                    advanceRead1 <= 0;
                    advanceRead2 <= 0;
                    lineSwitchOutputDisable <= 0;
                end

                //Once the RRB has enough data, let data be read from it. If all input data has been written, always allow read
                //(this block comes last, so its assignments win over the ones above in the same clock)
                if((fillCount >= 2 && dOutValidInt == 0) || allDataWritten)
                begin
                    if((!advanceRead1 && !advanceRead2))
                    begin
                        dOutValidInt <= 1;
                        lineSwitchOutputDisable <= 0;
                    end
                end
            end//state RS_READ_LINE:
        endcase

        //yScaleAmountNext is used to determine which input lines are valid.
        yScaleAmountNext <= (outputLine + 1) * yScale + {{OUTPUT_Y_RES_WIDTH{1'b0}}, topFracOffset};
    end
end

assign readAddress = xPixLow;

//Generate dOutValid signal, delayed to account for delays in data path
reg dOutValid_1;
reg dOutValid_2;
reg dOutValid_3;

always @(posedge clk or posedge rst)
begin
    if(rst)
    begin
        dOutValid_1 <= 0;
        dOutValid_2 <= 0;
        dOutValid_3 <= 0;
        dOutValid <= 0;
    end
    else
    begin
        dOutValid_1 <= nextDout && dOutValidInt && !lineSwitchOutputDisable;
        dOutValid_2 <= dOutValid_1;
        dOutValid_3 <= dOutValid_2;
        dOutValid <= dOutValid_3;
    end
end

//-----------------------Output data generation-----------------------------
//Scale amount values are used to generate coefficients for the four pixels coming out of the RRB to be multiplied with.

//Coefficients for each of the four pixels
//Format Q1.FRACTION_BITS
//                         yx
reg [COEFF_WIDTH-1:0]   coeff00;    //Top left
reg [COEFF_WIDTH-1:0]   coeff01;    //Top right
reg [COEFF_WIDTH-1:0]   coeff10;    //Bottom left
reg [COEFF_WIDTH-1:0]   coeff11;    //Bottom right

//Coefficient value of one, format Q1.COEFF_WIDTH-1
wire [COEFF_WIDTH-1:0]  coeffOne = {1'b1, {(COEFF_WIDTH-1){1'b0}}};    //One in MSb, zeros elsewhere
//Coefficient value of one half, format Q1.COEFF_WIDTH-1
wire [COEFF_WIDTH-1:0]  coeffHalf = {2'b01, {(COEFF_WIDTH-2){1'b0}}};

//Compute bilinear interpolation coefficients. Done here because these pre-registered values are used twice.
//Adding coeffHalf to get the nearest value.
wire [COEFF_WIDTH-1:0]  preCoeff00 = (((coeffOne - xBlend) * (coeffOne - yBlend) + (coeffHalf - 1)) >> FRACTION_BITS) & {{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
wire [COEFF_WIDTH-1:0]  preCoeff01 = ((xBlend * (coeffOne - yBlend) + (coeffHalf - 1)) >> FRACTION_BITS) & {{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
wire [COEFF_WIDTH-1:0]  preCoeff10 = (((coeffOne - xBlend) * yBlend + (coeffHalf - 1)) >> FRACTION_BITS) & {{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};

//Compute the coefficients
always @(posedge clk or posedge rst)
begin
    if(rst)
    begin
        coeff00 <= 0;
        coeff01 <= 0;
        coeff10 <= 0;
        coeff11 <= 0;
        xBlend <= 0;
    end
    else
    begin
        xBlend <= {1'b0, xScaleAmount[SCALE_FRAC_BITS-1:SCALE_FRAC_BITS-FRACTION_BITS]};   //Changed to registered to improve timing

        if(nearestNeighbor == 1'b0)
        begin
            //Normal bilinear interpolation
            coeff00 <= preCoeff00;
            coeff01 <= preCoeff01;
            coeff10 <= preCoeff10;
            coeff11 <= ((xBlend * yBlend + (coeffHalf - 1)) >> FRACTION_BITS) & {{COEFF_WIDTH{1'b0}}, {COEFF_WIDTH{1'b1}}};
            //coeff11 <= coeffOne - preCoeff00 - preCoeff01 - preCoeff10;     //Guarantee that all coefficients sum to coeffOne. Saves a multiply too. Reverted to previous method due to timing issues.
        end
        else
        begin
            //Nearest neighbor interpolation, set one coefficient to 1.0, the rest to zero based on the fractions
            coeff00 <= xBlend < coeffHalf && yBlend < coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
            coeff01 <= xBlend >= coeffHalf && yBlend < coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
            coeff10 <= xBlend < coeffHalf && yBlend >= coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
            coeff11 <= xBlend >= coeffHalf && yBlend >= coeffHalf ? coeffOne : {COEFF_WIDTH{1'b0}};
        end
    end
end


//Generate the blending multipliers
//Each productNN packs one (DATA_WIDTH+COEFF_WIDTH)-bit product per channel.
reg [(DATA_WIDTH+COEFF_WIDTH)*CHANNELS-1:0] product00, product01, product10, product11;

generate
genvar channel;
    for(channel = 0; channel < CHANNELS; channel = channel + 1)
        begin : blend_mult_generate
            always @(posedge clk or posedge rst)
            begin
                if(rst)
                begin
                    //productxx[channel] <= 0;
                    product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
                    product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
                    product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;
                    product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= 0;

                    //readDataxxReg[channel] <= 0;
                    readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
                    readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
                    readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
                    readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;

                    //dOut[channel] <= 0;
                    dOut[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= 0;
                end
                else
                begin
                    //readDataxxReg[channel] <= readDataxx[channel];
                    readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData00[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
                    readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData01[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
                    readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData10[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];
                    readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <= readData11[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ];

                    //productxx[channel] <= readDataxxReg[channel] * coeffxx
                    product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData00Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff00;
                    product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData01Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff01;
                    product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData10Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff10;
                    product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel] <= readData11Reg[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] * coeff11;

                    //dOut[channel] <= (((product00[channel]) +
                    //                   (product01[channel]) +
                    //                   (product10[channel]) +
                    //                   (product11[channel])) >> FRACTION_BITS) & ({ {COEFF_WIDTH{1'b0}}, {DATA_WIDTH{1'b1}} });
                    dOut[ DATA_WIDTH*(channel+1)-1 : DATA_WIDTH*channel ] <=
                        (((product00[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) +
                          (product01[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) +
                          (product10[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel]) +
                          (product11[ (DATA_WIDTH+COEFF_WIDTH)*(channel+1)-1 : (DATA_WIDTH+COEFF_WIDTH)*channel])) >> FRACTION_BITS) & ({ {COEFF_WIDTH{1'b0}}, {DATA_WIDTH{1'b1}} });
                end
            end
        end
endgenerate


//---------------------------Data write logic----------------------------------
//Places input data into the correct ram in the RFIFO (ram FIFO)
//Controls writing to the RFIFO, and discards lines that aren't used

reg [INPUT_Y_RES_WIDTH-1:0]     writeNextValidLine;     //Which line greater than writeRowCount is the next one that must be read in
reg [INPUT_Y_RES_WIDTH-1:0]     writeNextPlusOne;       //One greater than writeNextValidLine, because we must always read in two adjacent lines
reg [INPUT_Y_RES_WIDTH-1:0]     writeRowCount;          //Which line we're reading from dIn
reg [OUTPUT_Y_RES_WIDTH-1:0]    writeOutputLine;        //The output line that corresponds to the input line. This is incremented until writeNextValidLine is greater than writeRowCount
reg                             getNextPlusOne;         //Flag so that writeNextPlusOne is captured only once after writeRowCount >= writeNextValidLine. This is in case multiple cycles are required until writeNextValidLine changes.

//Determine which lines to read out and which to discard.
//writeNextValidLine is the next valid line number that needs to be read out above current value writeRowCount
//writeNextPlusOne also needs to be read out (to do interpolation), this may or may not be equal to writeNextValidLine
always @(posedge clk or posedge rst or posedge start)
begin
    if(rst | start)
    begin
        writeOutputLine <= 0;
        writeNextValidLine <= 0;
        writeNextPlusOne <= 1;
        getNextPlusOne <= 1;
    end
    else
    begin
        if(writeRowCount >= writeNextValidLine)     //When writeRowCount becomes higher than the next valid line to read out, compute the next valid line.
        begin
            if(getNextPlusOne)                      //Keep writeNextPlusOne
            begin
                writeNextPlusOne <= writeNextValidLine + 1;
            end
            getNextPlusOne <= 0;
            writeOutputLine <= writeOutputLine + 1;
            //Integer part of (output line * yScale + top offset) is an INPUT line index, so keep
            //INPUT_Y_RES_WIDTH bits of it - the same width the read side uses for yPixLow. The
            //zero-extended topFracOffset keeps the multiply at OUTPUT_Y_RES_WIDTH + SCALE_BITS bits.
            writeNextValidLine <= ((writeOutputLine*yScale + {{(OUTPUT_Y_RES_WIDTH + SCALE_INT_BITS){1'b0}}, topFracOffset}) >> SCALE_FRAC_BITS) & {{SCALE_BITS{1'b0}}, {INPUT_Y_RES_WIDTH{1'b1}}};
        end
        else
        begin
            getNextPlusOne <= 1;
        end
    end
end

reg         discardInput;
reg [DISCARD_CNT_WIDTH-1:0] discardCountReg;
wire        advanceWrite;

reg [1:0]   writeState;

reg [INPUT_X_RES_WIDTH-1:0] writeColCount;
reg         enableNextDin;
reg         forceRead;

//Write state machine
//Controls writing scaler input data into the RRB

//(localparam: internal encodings, not meant to be overridden from outside)
localparam WS_START   = 0;
localparam WS_DISCARD = 1;
localparam WS_READ    = 2;
localparam WS_DONE    = 3;

//Control write and address signals to write data into ram FIFO
always @ (posedge clk or posedge rst or posedge start)
begin
    if(rst | start)
    begin
        writeState <= WS_START;
        enableNextDin <= 0;
        discardInput <= 0;
        readyForRead <= 0;
        writeRowCount <= 0;
        writeColCount <= 0;
        discardCountReg <= 0;
        forceRead <= 0;
    end
    else
    begin
        case (writeState)

            WS_START:
            begin
                //Latch the discard count; go through WS_DISCARD only if there is something to discard
                discardCountReg <= inputDiscardCnt;
                enableNextDin <= 1;
                if(inputDiscardCnt > 0)
                begin
                    discardInput <= 1;
                    writeState <= WS_DISCARD;
                end
                else
                begin
                    discardInput <= 0;
                    writeState <= WS_READ;
                end
            end

            WS_DISCARD: //Discard pixels from input data
            begin
                if(dInValid)
                begin
                    discardCountReg <= discardCountReg - 1;
                    if((discardCountReg - 1) == 0)
                    begin
                        discardInput <= 0;
                        writeState <= WS_READ;
                    end
                end
            end

            WS_READ:
            begin
                if(dInValid & nextDin)
                begin
                    if(writeColCount == inputXRes)
                    begin   //Occurs on the last pixel in the line
                        if((writeNextValidLine == writeRowCount + 1) ||
                           (writeNextPlusOne == writeRowCount + 1))
                        begin   //Next line is valid, write into buffer
                            discardInput <= 0;
                        end
                        else
                        begin   //Next line is not valid, discard
                            discardInput <= 1;
                        end

                        //Once writeRowCount is >= 2, data is ready to start being output.
                        //(tests bit 1 only; readyForRead stays set until the next rst/start)
                        if(writeRowCount[1])
                            readyForRead <= 1;

                        if(writeRowCount == inputYRes)  //When all data has been read in, stop reading.
                        begin
                            writeState <= WS_DONE;
                            enableNextDin <= 0;
                            forceRead <= 1;
                        end

                        writeColCount <= 0;
                        writeRowCount <= writeRowCount + 1;
                    end
                    else
                    begin
                        writeColCount <= writeColCount + 1;
                    end
                end
            end

            WS_DONE:
            begin
                //do nothing, wait for reset
            end

        endcase
    end
end


//Advance write whenever we have just written a valid line (discardInput == 0)
//Generate this signal one earlier than discardInput above that uses the same conditions, to advance the buffer at the right time.
assign advanceWrite =   (writeColCount == inputXRes) & (discardInput == 0) & dInValid & nextDin;
assign allDataWritten = writeState == WS_DONE;
assign nextDin = (fillCount < BUFFER_SIZE) & enableNextDin;

ramFifo #(
    .DATA_WIDTH( DATA_WIDTH*CHANNELS ),
    .ADDRESS_WIDTH( INPUT_X_RES_WIDTH ),    //Controls width of RAMs
    .BUFFER_SIZE( BUFFER_SIZE )             //Number of RAMs
) ramRB (
    .clk( clk ),
    .rst( rst | start ),
    .advanceRead1( advanceRead1 ),
    .advanceRead2( advanceRead2 ),
    .advanceWrite( advanceWrite ),
    .forceRead( forceRead ),

    .writeData( dIn ),
    .writeAddress( writeColCount ),
    .writeEnable( dInValid & nextDin & enableNextDin & ~discardInput ),
    .fillCount( fillCount ),

    .readData00( readData00 ),
    .readData01( readData01 ),
    .readData10( readData10 ),
    .readData11( readData11 ),
    .readAddress( readAddress )
);

endmodule   //scaler


//---------------------------Ram FIFO (RFIFO)-----------------------------
//FIFO buffer with rams as the elements, instead of data
//One ram is filled, while two others are simultaneously read out.
//Four neighboring pixels are read out at once, at the selected RAM and one line down, and at readAddress and readAddress + 1
//
//writeSelect and readSelect are one-hot ring registers, both reset to RAM 0.
//fillCount tracks how many RAMs have been written (advanceWrite) and not yet
//released (advanceRead1 / advanceRead2).
//BUFFER_SIZE must be at least 2: two adjacent RAMs are always read together.
module ramFifo #(
    parameter DATA_WIDTH = 8,
    parameter ADDRESS_WIDTH = 8,
    parameter BUFFER_SIZE = 2,
    parameter BUFFER_SIZE_WIDTH =   ((BUFFER_SIZE+1) <= 2) ? 1 :    //wide enough to hold value BUFFER_SIZE + 1
                                    ((BUFFER_SIZE+1) <= 4) ? 2 :
                                    ((BUFFER_SIZE+1) <= 8) ? 3 :
                                    ((BUFFER_SIZE+1) <= 16) ? 4 :
                                    ((BUFFER_SIZE+1) <= 32) ? 5 :
                                    ((BUFFER_SIZE+1) <= 64) ? 6 : 7
)(
    input wire                          clk,
    input wire                          rst,
    input wire                          advanceRead1,   //Advance selected read RAM by one
    input wire                          advanceRead2,   //Advance selected read RAM by two
    input wire                          advanceWrite,   //Advance selected write RAM by one
    input wire                          forceRead,      //Disables writing to allow all data to be read out (RAM being written to cannot be read from normally)

    input wire [DATA_WIDTH-1:0]         writeData,
    input wire [ADDRESS_WIDTH-1:0]      writeAddress,
    input wire                          writeEnable,
    output reg [BUFFER_SIZE_WIDTH-1:0]  fillCount,

    //                                  yx
    output wire [DATA_WIDTH-1:0]        readData00,     //Read from deepest RAM (earliest data), at readAddress
    output wire [DATA_WIDTH-1:0]        readData01,     //Read from deepest RAM (earliest data), at readAddress + 1
    output wire [DATA_WIDTH-1:0]        readData10,     //Read from second deepest RAM (second earliest data), at readAddress
    output wire [DATA_WIDTH-1:0]        readData11,     //Read from second deepest RAM (second earliest data), at readAddress + 1
    input wire [ADDRESS_WIDTH-1:0]      readAddress
);

reg [BUFFER_SIZE-1:0]       writeSelect;
reg [BUFFER_SIZE-1:0]       readSelect;

//Read select ring register
always @(posedge clk or posedge rst)
begin
    if(rst)
        readSelect <= 1;
    else
    begin
        if(advanceRead1)
        begin
            //Rotate left by one
            readSelect <= {readSelect[BUFFER_SIZE-2 : 0], readSelect[BUFFER_SIZE-1]};
        end
        else if(advanceRead2)
        begin
            //Rotate left by two. Written with shifts rather than the part-select
            //readSelect[BUFFER_SIZE-3:0], which is an illegal range when BUFFER_SIZE == 2
            //(this module's default). For BUFFER_SIZE == 2 this is the identity.
            readSelect <= (readSelect << 2) | (readSelect >> (BUFFER_SIZE-2));
        end
    end
end

//Write select ring register
always @(posedge clk or posedge rst)
begin
    if(rst)
        writeSelect <= 1;
    else
    begin
        if(advanceWrite)
        begin
            writeSelect <= {writeSelect[BUFFER_SIZE-2 : 0], writeSelect[BUFFER_SIZE-1]};
        end
    end
end

//RAM outputs are indexed by the one-hot select VALUE (RAM i drives entry 2**i),
//so the arrays have 2**BUFFER_SIZE entries of which only BUFFER_SIZE are driven.
wire [DATA_WIDTH-1:0] ramDataOutA [2**BUFFER_SIZE-1:0];
wire [DATA_WIDTH-1:0] ramDataOutB [2**BUFFER_SIZE-1:0];

//Generate to instantiate the RAMs
generate
genvar i;
    for(i = 0; i < BUFFER_SIZE; i = i + 1)
        begin : ram_generate

            ramDualPort #(
                .DATA_WIDTH( DATA_WIDTH ),
                .ADDRESS_WIDTH( ADDRESS_WIDTH )
            ) ram_inst_i(
                .clk( clk ),

                //Port A is written to as well as read from. When writing, this port cannot be read from.
                //As long as the buffer is large enough, this will not cause any problem.
                .addrA( ((writeSelect[i] == 1'b1) && !forceRead && writeEnable) ? writeAddress : readAddress ), //&& writeEnable is
                //to allow the full buffer to be used. After the buffer is filled, write is advanced, so writeSelect
                //and readSelect are the same. The full buffer isn't written to, so this allows the read to work properly.
                .dataA( writeData ),
                .weA( ((writeSelect[i] == 1'b1) && !forceRead) ? writeEnable : 1'b0 ),
                .qA( ramDataOutA[2**i] ),

                //Port B is read-only and always addresses the pixel to the right of readAddress
                .addrB( readAddress + 1 ),
                .dataB( {DATA_WIDTH{1'b0}} ),
                .weB( 1'b0 ),
                .qB( ramDataOutB[2**i] )
            );
        end
endgenerate

//Select which ram to read from
wire [BUFFER_SIZE-1:0]  readSelect0 = readSelect;
wire [BUFFER_SIZE-1:0]  readSelect1 = (readSelect << 1) | readSelect[BUFFER_SIZE-1];   //readSelect rotated left by one: the next RAM in the ring

//Steer the output data to the right ports
assign readData00 = ramDataOutA[readSelect0];
assign readData10 = ramDataOutA[readSelect1];
assign readData01 = ramDataOutB[readSelect0];
assign readData11 = ramDataOutB[readSelect1];

//Keep track of fill level
//Net change per clock = (+1 if advanceWrite) - (1 if advanceRead1, else 2 if advanceRead2)
always @(posedge clk or posedge rst)
begin
    if(rst)
    begin
        fillCount <= 0;
    end
    else
    begin
        if(advanceWrite)
        begin
            if(advanceRead1)
                fillCount <= fillCount;
            else if(advanceRead2)
                fillCount <= fillCount - 1;
            else
                fillCount <= fillCount + 1;
        end
        else
        begin
            if(advanceRead1)
                fillCount <= fillCount - 1;
            else if(advanceRead2)
                fillCount <= fillCount - 2;
            else
                fillCount <= fillCount;
        end
    end
end

endmodule //ramFifo


//Dual port RAM
//Two independent ports (A and B) share one clock and one memory array of
//2**ADDRESS_WIDTH words, each DATA_WIDTH bits wide.
//Per port, on every rising clock edge:
//  - write enable high: dataX is stored at addrX and also appears on qX
//  - write enable low:  qX is loaded with the word stored at addrX
//A port reading an address that the other port writes in the same clock edge
//gets the previously stored word (the write uses a nonblocking assignment).
//There is no reset; memory contents and outputs are undefined until written.
module ramDualPort #(
    parameter DATA_WIDTH = 8,
    parameter ADDRESS_WIDTH = 8
)(
    input wire [(DATA_WIDTH-1):0] dataA, dataB,
    input wire [(ADDRESS_WIDTH-1):0] addrA, addrB,
    input wire weA, weB, clk,
    output reg [(DATA_WIDTH-1):0] qA, qB
);

    // Declare the RAM variable
    reg [DATA_WIDTH-1:0] ram[2**ADDRESS_WIDTH-1:0];

    //Port A
    always @ (posedge clk)
    begin
        if (weA)
        begin
            ram[addrA] <= dataA;
            qA <= dataA;            //output follows the data being written
        end
        else
        begin
            qA <= ram[addrA];
        end
    end

    //Port B
    always @ (posedge clk)
    begin
        if (weB)
        begin
            ram[addrB] <= dataB;
            qB <= dataB;            //output follows the data being written
        end
        else
        begin
            qB <= ram[addrB];
        end
    end

endmodule //ramDualPort

//Restore the default net type so the `default_nettype none setting used by
//this file does not carry over into files compiled after it.
`default_nettype wire