I have implemented a simple windowing operation in Halide-HLS and it works fine. The pipeline code is attached below.
In the generated HLS code ("hls_target.cpp"), there is a linebuffer instance with dimensions 66 by 66, like this:
1- Why 66 x 66 and not 64 x 64? I do not think it is caused by the Halide boundary condition, since the data is streamed into the pipeline after the boundary condition has been applied.
2- Why instantiate a line buffer this big? For a 3 x 3 windowing operation on a tile in a streaming manner, just 3 rows (64 pixels each) of data would be sufficient to start windowing.
3- The Vivado-HLS synthesis report shows that it infers 2 instances of 18Kb block RAM for the line buffer. I cannot relate this amount of memory to the instantiated line buffer. Any ideas?
#include "Halide.h"
#include <stdio.h>
using namespace Halide;
// Loop variables used by the pipeline definition below. `z` and `c` are
// declared here but are not referenced anywhere in the visible code.
Var x("x");
Var y("y");
Var z("z");
Var c("c");

// Outer (xo, yo) and inner (xi, yi) variables handed to the tile() calls
// in the schedules.
Var xo("xo");
Var yo("yo");
Var xi("xi");
Var yi("yi");
/// Defines a 3x3 hot-pixel filter as a Halide pipeline and emits it either
/// for a native CPU target (compile_cpu) or for the HLS / Zynq flow
/// (compile_hls).
///
/// Algorithm: every pixel whose value is greater than `hotPixelStrength` is
/// replaced by the sum of its 8 neighbours shifted right by 3 (i.e. divided
/// by 8); all other pixels pass through unchanged.
///
/// Both compile_* methods add scheduling directives to `output`, and main()
/// builds a separate MyPipeline instance for each of them.
class MyPipeline {
public:
    ImageParam Input_Image;            // 2-D, 16-bit unsigned input image
    Param<uint16_t> hotPixelStrength;  // threshold compared against each input pixel
    Func output;                       // pipeline output; forwards hw_output
    Func hw_output;                    // filtered result; the Func passed to accelerate()
    std::vector<Argument> args;        // arguments of the generated functions
    Func Input;                        // Input_Image with a constant-zero exterior
    Func sum_x;                        // horizontal 3-tap sum
    Func sum_xy;                       // 3x3 sum minus the centre pixel
    Func Conv;                         // sum_xy >> 3

    /// Builds the algorithm (no scheduling is applied here).
    MyPipeline()
        // Initializers are listed in member declaration order. Members are
        // always constructed in declaration order, so the previous list
        // (hw_output before output) was misleading and triggered -Wreorder.
        : Input_Image(UInt(16), 2),
          output("output"),
          hw_output("hw_output")
    {
        // Reads outside the image bounds return 0.
        Input = Halide::BoundaryConditions::constant_exterior(Input_Image, 0);

        // Separable 3x3 box sum; subtracting the centre leaves the sum of the
        // 8 neighbours, so the shift by 3 yields their average.
        // NOTE(review): the operands are 16-bit unsigned, so this sum
        // presumably wraps for large pixel values - confirm the expected
        // input range.
        sum_x(x, y) = (Input(x - 1, y) + Input(x, y) + Input(x + 1, y));
        sum_xy(x, y) = (sum_x(x, y-1) + sum_x(x, y) + sum_x(x, y+1) - Input(x, y));
        Conv(x, y) = (sum_xy(x, y)) >> 3;

        // Replace only the pixels that exceed the threshold.
        hw_output(x,y) = select(Input(x,y) > hotPixelStrength, Conv(x, y), Input(x,y));
        output(x,y) = hw_output(x,y);

        // Arguments
        args = {Input_Image, hotPixelStrength};
    }

    /// Schedules `output` in 64x64 tiles and writes "pipeline_native.h" and
    /// "pipeline_native.o" exposing the function "pipeline_native".
    void compile_cpu() {
        std::cout << "\ncompiling cpu code..." << std::endl;

        output.tile(x, y, xo, yo, xi, yi, 64, 64);

        output.compile_to_header("pipeline_native.h", args, "pipeline_native");
        output.compile_to_object("pipeline_native.o", args, "pipeline_native");
    }

    /// Applies the accelerator schedule and writes the HLS artifacts
    /// ("pipeline_hls.*") followed by the Zynq artifacts ("pipeline_zynq.*").
    void compile_hls() {
        std::cout << "\ncompiling HLS code..." << std::endl;

        // Process the output in 64x64 tiles and compute hw_output and Input
        // once per tile, at the xo loop of output.
        output.tile(x, y, xo, yo, xi, yi, 64, 64);
        hw_output.compute_at(output, xo);
        Input.compute_at(output, xo);

        // accelerate() is a Halide-HLS scheduling call; presumably it marks
        // the xi..xo loops of hw_output as the hardware region with Input as
        // its streamed input - confirm against the Halide-HLS documentation.
        hw_output.tile(x, y, xo, yo, xi, yi, 64, 64).accelerate({Input}, xi, xo);
        //output.print_loop_nest();

        // Create the target for HLS simulation
        Target hls_target = get_target_from_environment();
        hls_target.set_feature(Target::CPlusPlusMangling);
        output.compile_to_lowered_stmt("pipeline_hls.ir.html", args, HTML, hls_target);
        output.compile_to_hls("pipeline_hls.cpp", args, "pipeline_hls", hls_target);
        output.compile_to_header("pipeline_hls.h", args, "pipeline_hls", hls_target);

        // 32-bit ARM Linux target with the Zynq feature enabled.
        std::vector<Target::Feature> features({Target::Zynq});
        Target target(Target::Linux, Target::ARM, 32, features);
        output.compile_to_zynq_c("pipeline_zynq.c", args, "pipeline_zynq", target);
        output.compile_to_header("pipeline_zynq.h", args, "pipeline_zynq", target);
        output.compile_to_object("pipeline_zynq.o", args, "pipeline_zynq", target);
        output.compile_to_lowered_stmt("pipeline_zynq.ir.html", args, HTML, target);
        output.compile_to_assembly("pipeline_zynq.s", args, "pipeline_zynq", target);
    }
};
int main(int argc, char **argv) {
MyPipeline p1;
p1.compile_cpu();
MyPipeline p2;
p2.compile_hls();
return 0;
}