Comments (15)
Hi @shrit,
I wanted to share some results from benchmarking I've been doing. I compared the Padé Approximant method (see: https://en.wikipedia.org/wiki/Pad%C3%A9_approximant) with the current Fast approximation approach. The Padé method showed a small but consistent performance improvement.
#include <chrono>
#include <armadillo>
#include <iostream>
#include <fstream>
arma::mat generateRandomMatrix(double mean, double stddev, int rows, int cols) {
arma::mat output = arma::randu(rows, cols);
output = (output * stddev * std::sqrt(12.0)) + mean - stddev * std::sqrt(3.0);
return output;
}
// Function to calculate the exponential of a matrix using Armadillo
arma::mat calculateExponentialArmadillo(arma::mat& output) {
auto start = std::chrono::high_resolution_clock::now();
output = arma::exp(output * (-1));
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
std::cout << "Duration arma exp: " << duration.count() << " nano seconds" << std::endl;
return output;
}
// Function to calculate Padé approximant for exp(-x) for x positive
double padeApproximant(double x) {
static constexpr double num_coeffs[] = {120, -60, 12};
static constexpr double den_coeffs[] = {120, 60, 12};
double num = num_coeffs[0] + x * (num_coeffs[1] + x * num_coeffs[2]);
double den = den_coeffs[0] + x * (den_coeffs[1] + x * den_coeffs[2]);
return num / den;
}
// Function to calculate the exponential of a matrix using Padé approximant
arma::mat calculateExponentialPade(arma::mat& output) {
auto start = std::chrono::high_resolution_clock::now();
output.transform([](double x) {
return padeApproximant(x);
});
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
std::cout << "Duration Padé exp: " << duration.count() << " nano seconds" << std::endl;
return output;
}
// Function to calculate the exponential of a matrix using a fast approximation
arma::mat calculateExponentialFast(arma::mat& output) {
auto start = std::chrono::high_resolution_clock::now();
output.transform([](double x) {
static constexpr double A0 = 1.0;
static constexpr double A1 = 0.125;
static constexpr double A2 = 0.0078125;
static constexpr double A3 = 0.00032552083;
static constexpr double A4 = 1.0172526e-5;
if (x < 13.0) {
double y = A0 + x * (A1 + x * (A2 + x * (A3 + x * A4)));
y *= y;
y *= y;
y *= y;
y = 1 / y;
return y;
}
return 0.0;
});
auto stop = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
std::cout << "Duration new exp: " << duration.count() << " nano seconds" << std::endl;
return output;
}
int main() {
double mean = 0.0;
double stddev = 2.0;
std::ofstream file("benchmark_results.csv");
file << "MatrixSize,ArmadilloDuration,FastDuration,PadeDuration\n";
std::vector<int> sizes = {10, 50, 100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000};
for (int size : sizes) {
arma::mat output = generateRandomMatrix(mean, stddev, size, size);
arma::mat output_2 = output;
arma::mat output_3 = output;
std::cout << "\n\nSize: " << size << std::endl;
auto start = std::chrono::high_resolution_clock::now();
output = calculateExponentialArmadillo(output);
auto stop = std::chrono::high_resolution_clock::now();
auto duration_arma = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
start = std::chrono::high_resolution_clock::now();
output_2 = calculateExponentialFast(output_2);
stop = std::chrono::high_resolution_clock::now();
auto duration_fast = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
start = std::chrono::high_resolution_clock::now();
output_3 = calculateExponentialPade(output_3);
stop = std::chrono::high_resolution_clock::now();
auto duration_pade = std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start);
file << size << "," << duration_arma.count() << "," << duration_fast.count() << "," << duration_pade.count() << "\n";
}
file.close();
}
I also tried to optimize the dropout algorithm in your second comment with the find_and_fill and conv_to_mat functions. I'd love to get your feedback on these two optimized versions I came up with:
#include <cmath>
#include <fstream>
#include <vector>
// Function to time a given operation and return the duration
template<typename Func>
auto time_operation(Func operation) {
auto start = std::chrono::high_resolution_clock::now();
operation();
auto stop = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count();
}
int main()
{
double ratio = 0.2;
std::ofstream file("benchmark_results_dropout.csv");
file << "Matrix Size,mlpack,omar,find_and_fill,conv_to_mat\n";
std::vector<int> sizes = {10, 50, 100, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000};
for (int size : sizes) {
arma::mat output = arma::randu(size, size);
arma::mat output_2 = output;
arma::mat output_3 = output;
arma::mat output_4 = output;
auto mlpack_time = time_operation([&]{ output.transform([&] (double val) { return (val > ratio); }); });
auto omar_time = time_operation([&]{ output_2 = arma::sign(output_2 - ratio); output_2.clamp(0, 1); });
auto find_and_fill_time = time_operation([&]{ arma::uvec indices = arma::find(output_3 > ratio); output_3.zeros(); output_3.elem(indices).fill(1); });
auto conv_to_mat_time = time_operation([&]{ output_4 = arma::conv_to<arma::mat>::from(output_4 > ratio); });
file << size << "," << mlpack_time << "," << omar_time << "," << find_and_fill_time << "," << conv_to_mat_time << "\n";
}
file.close();
return 0;
}
What do you think about the results? Are they even relevant regarding the minimal time improvements?
from mlpack.
We do need a generic implementation that will work with Bandicoot and that is what this issue is about, but since this is such an important piece of code I think having a specialized Armadillo version is worthwhile. We can just use template specializations or SFINAE to do it, so we use the faster approach when the user is using Armadillo and the generic one otherwise.
I'm interested in the approximation level of the Pade approximant approach, @MarkFischinger can you comment on that?
from mlpack.
btw, I really like the fact of using the template parameter as a function pointer.
This is really a nice way of doing it đŻ
// Function to time a given operation and return the duration
template<typename Func>
auto time_operation(Func operation) {
auto start = std::chrono::high_resolution_clock::now();
operation();
auto stop = std::chrono::high_resolution_clock::now();
return std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count();
}
from mlpack.
Hi @rcurtin,
the first graph shows the absolute error using:
std::abs(actual - fastApprox);
std::abs(actual - padeApprox);
@shrit I have some concerns about the expected range of
arma::mat output = arma::randn(1000, 1000, arma::distr_param(0, 2));
if (x < 13.0) {...}
In this context, the likelihood of generating a number greater than
Once I'm clear on the range of X values we're dealing with, I plan to run a few more backtests with more variance before opening that PR :)
from mlpack.
@MarkFischinger would you open a PR and replace the current transform function for the dropout with find and fill
?
I never had problem training mlpack on various numbers of datasets, also this is for an activation function which is basically at the end of the neural network, and most if not all of weights distributions are close to zero with small standard deviation.
would you also open a PR for padé approximation ? and then see if all the tests we have will pass or not
from mlpack.
I think the X >= 13
limit is just to give a shortcut for large values. It doesn't have to be 13, but I suspect it won't make a huge difference for neural network training anyway as a large X
value implies a very high activation of the neuron, so it is probably true that small differences there don't matter much. You could try on the mnist_cnn
example or similar with different limit values to see (you will have to ensure that the random seed is set to the same thing to compare accurately).
from mlpack.
Interestingly, positioning the if statement at the beginning of only the fast approximate significantly reduced the cycle count by nearly half
Now that is strange and mysterious. I wonder if looking at the generated assembly could give any better indication of what is going on, but, I could also accept that "processors are complicated and hard to explain sometimes". Better branch prediction could be an explanation for sure. I don't know if it's worth digging super deep---ideally we don't want an implementation that is tuned for a very specific processor but that doesn't generalize very well.
It may be worth trying again on a different system (I can provide a few if you like) and seeing if the results you have generalize, or if it's possible that the method you are using with RDTSC is noisy for an unrelated reason, or something like that.
Regarding the Padé approximant, I noticed that the non-scaled version performs faster up to about
x = 1.3
, maintaining similar accuracy.
Do you mean to say here that the number of cycles that it takes to compute the function depends on the value of x
?
Overall I do think it would be worthwhile to try an "overall" wall clock test on the MNIST examples, or something like this, which will exercise the code in an environment more faithful to how it will actually be used in practice. That may change the situation for what is available in cache and what isn't, and other effects like that.
Thanks for the deep digging here. đ
from mlpack.
Here is another one, with a benchmark
#include <chrono>
#include <armadillo>
int main()
{
arma::mat output = arma::randu(10, 10);
arma::mat output_2 = output;
output.print("original matrix");
double ratio = 0.2;
auto start_mlpack = std::chrono::high_resolution_clock::now();
output.transform([&] (double val) { return (val > ratio); });
auto stop_mlpack = std::chrono::high_resolution_clock::now();
output.print("dropout matrix mlpack: ");
auto start_omar = std::chrono::high_resolution_clock::now();
output_2 = arma::sign(output_2 - ratio);
output_2.clamp(0, 1);
auto stop_omar = std::chrono::high_resolution_clock::now();
output_2.print("dropout matrix omar: ");
auto duration_mlpack = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_mlpack - start_mlpack);
std::cout << "duration mlpack : " << duration_mlpack.count() << " nano seconds" << std::endl;
auto duration_omar = std::chrono::duration_cast<std::chrono::nanoseconds>(stop_omar - start_omar);
std::cout << "duration omar : " << duration_omar.count() << " nano seconds" << std::endl;
}
n = 10 x 10
duration mlpack : 738 nano seconds
duration omar : 3248 nano seconds
n = 100 x 100
duration mlpack : 55383 nano seconds
duration omar : 216340 nano seconds
from mlpack.
better implementations are welcomed đ
from mlpack.
@MarkFischinger the idea is not to find a better implementation, but instead is to get rid of the usage of the transform function from armadillo because we can not no longer use it in mlpack code base
nice try though
from mlpack.
@Shirt,
Oh, my bad, I don't know how I missed that :/
@rcurtin,
The Padé approximant offers a more accurate representation of
For the benchmarks, I chose the coefficients, numerator polynomial num_coeffs[] = {120, -60, 12}
den_coeffs[] = {120, 60, 12}
from mlpack.
Oh, nice, this looks good. In the first graph you posted, is that absolute error or relative error?
from mlpack.
@rcurtin if we agree on using the Padé Approximation, @MarkFischinger would you by then open a PR ? and replace the current solution with Padé ? Also would you do the same for Dropout layer ? by using the following:
auto find_and_fill_time = time_operation([&]{ arma::uvec indices = arma::find(output_3 > ratio); output_3.zeros(); output_3.elem(indices).fill(1); });
@rcurtin looking at the graph I think it worth having one instead of two function for the dropout, the loss in speed is not that major. What do you think ?
from mlpack.
Yeah, I think the Padé approximation is an improvement overall. @zoq you did the original fast version, what do you think?
For the dropout layer I think we are actually better off changing the strategy overall. We currently generate a randu matrix and then threshold it to generate a mask. I think it might be better if we could either generate the mask on-the-fly, or generate a 0/1 matrix with the right probabilities directly. Anyway, for now I would just go with one solution, since I think more optimization is possible there, even with a general implementation that will work for both Bandicoot and Armadillo.
from mlpack.
@shrit After spending several days researching and testing, I decided to switch to using RDTSC and spreading the execution to multiple cores so that I get more average_iterations
using OpenMP, for more accurate time measurement. I was experiencing inconsistent runtimes in nanoseconds when using chrono::high_resolution_clock
, even when averaging multiple iterations, due to CPU variance (learned a lot about the CPU ;)
Regarding the Padé approximant, I noticed that the non-scaled version performs faster up to about
double lambda2(double x) {
if (x >= 13.0) {
return 0.0;
}
static constexpr double A0 = 1.0, A1 = 0.125, A2 = 0.0078125, A3 = 0.00032552083, A4 = 1.0172526e-5;
double y = A0 + x * (A1 + x * (A2 + x * (A3 + x * A4)));
y *= y; y *= y; y *= y;
return 1 / y;
}
Lambda1 Cycles: 32, Average Relative Error: 0.035419
Lambda2 Cycles: 18, Average Relative Error: 0.035419
I also experimented with another implementation using inline assembly and scaled padĂ© versions, but it didnât yield faster or more accurate results.
For the Find and Fill algorithm, I ran the RDTSC benchmark, which confirmed the previous metrics and provided a clearer visualization:
I'll update the Find and Fill method as soon as possible and am looking forward to trying out your idea, @rcurtin
from mlpack.
Related Issues (20)
- stb_image_write warning while compiling HOT 1
- Can't train a model having bias addition layer Add() HOT 8
- Reverse Convolution? HOT 6
- Documentation issue
- [R] - `verbose` argument has no effect HOT 1
- Get rid of `arma::fill::zeros` when we upgrade the minimum armadillo version HOT 5
- Document `internal_compact::` name space for `arma::fill` HOT 2
- [R] - Global option for 'verbose' argument HOT 5
- Add `.prepare` script to have r-universe automatically build new nightlies HOT 1
- bfd.h:35:2: error: #error config.h must be included before this header HOT 4
- Any ideas about Random Forest regressor? HOT 2
- Switch from `-j 2` to `-j ${nproc}` HOT 2
- dimensionality mismatch: Decision Tree CLI with both -t and -T specified
- [R] - Should the returning model object gain a class?
- NumPy 2.0 support
- [R] Switch `sprintf` to `snprintf` HOT 4
- Physics-Informed Neural Network possible with MLPack? HOT 1
- 1-D Convolution issues about time series data HOT 1
- Using Header-Only mlpack via CMake FetchContent and Automatic Dependency Download
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
đ Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. đđđ
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google â€ïž Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from mlpack.