satazor / js-spark-md5
Lightning fast normal and incremental md5 for JavaScript
License: Do What The F*ck You Want To Public License
I use SparkMD5.ArrayBuffer:

    spark.append(e.target.result); // append array buffer
    currentChunk++;
    if (currentChunk < chunks) {

The onload handler only executes once or twice and then fails. Thx!
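For comparison, here is a complete chunked-hashing sketch along the lines of the README's incremental example; file is assumed to be a File object and the 2 MB chunk size is arbitrary:

    var blobSlice = File.prototype.slice || File.prototype.mozSlice || File.prototype.webkitSlice,
        chunkSize = 2097152, // 2 MB per chunk (an assumption; adjust as needed)
        chunks = Math.ceil(file.size / chunkSize),
        currentChunk = 0,
        spark = new SparkMD5.ArrayBuffer(),
        fileReader = new FileReader();

    fileReader.onload = function (e) {
        spark.append(e.target.result); // append the chunk's ArrayBuffer
        currentChunk++;

        if (currentChunk < chunks) {
            loadNext();
        } else {
            console.log('computed hash', spark.end()); // hex digest of the whole file
        }
    };

    fileReader.onerror = function () {
        console.warn('oops, something went wrong.');
    };

    function loadNext() {
        var start = currentChunk * chunkSize,
            end = start + chunkSize >= file.size ? file.size : start + chunkSize;

        fileReader.readAsArrayBuffer(blobSlice.call(file, start, end));
    }

    loadNext();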
I am creating a simple file uploader. To detect whether a file has already been uploaded, I check its MD5 hash. I get three different hashes:
Browser - Spark MD5: 74e60eeb4ba275e2407b7006783eb034
Node - crypto: 77c0d9f9090cd219bf35c6284481f536
Node - Spark MD5: d80e3b2ead1198e6ec4fe1d9d8eb4244
I'm using the following code in the browser to get the md5 via Spark MD5.
// md5.js
import { FileChunker } from './chunker';
import SparkMD5 from 'spark-md5';

export default function calcFileMD5(file, bufferSize = 10485760, onProgress = function () {}) { // 10 MB
    if (typeof bufferSize === 'function') {
        onProgress = bufferSize;
        bufferSize = 10485760;
    }

    return new Promise((resolve, reject) => {
        const fileReader = new FileReader();
        const hashAlgorithm = new SparkMD5();
        const fileChunks = new FileChunker(file, bufferSize);

        fileReader.onload = function (e) {
            onProgress(fileChunks);
            const buffer = e.target.result;
            hashAlgorithm.append(buffer);

            if (fileChunks.hasNext()) {
                fileChunks.next();
                fileReader.readAsArrayBuffer(fileChunks.blob());
                return;
            }

            resolve(hashAlgorithm.end());
        };

        fileReader.onerror = function (error) {
            reject(error);
        };

        fileReader.readAsArrayBuffer(fileChunks.blob());
    });
}
// chunker.js
export default class Chunker {
    constructor(size, bufferSize) {
        this.size = size;
        this.bufferSize = bufferSize;
        this.chunks = Math.ceil(size / bufferSize);
        this.current = 0;
    }

    /**
     * Check if the current chunk is the last chunk
     */
    hasNext() {
        return this.current < this.chunks;
    }

    /**
     * Move to the next chunk
     */
    next() {
        this.current += 1;
        return this.indexes();
    }

    /**
     * Start and end index of the current chunk
     */
    indexes() {
        return { start: this.start(), end: this.end() };
    }

    /**
     * Start index of the current chunk
     */
    start() {
        return this.current * this.bufferSize;
    }

    /**
     * End index of the current chunk
     */
    end() {
        return Math.min(this.start() + this.bufferSize, this.size);
    }
}

export class FileChunker extends Chunker {
    constructor(file, bufferSize) {
        if (!(file instanceof File)) {
            throw new Error('file should be an instance of File');
        }

        super(file.size, bufferSize);
        this.file = file;
        this.fileSlicer = File.prototype.slice || File.prototype.mozSlice || File.prototype.webkitSlice;
    }

    blob() {
        return this.fileSlicer.call(this.file, this.start(), this.end());
    }
}
On the server I use the following code:
// md5.js
const crypto = require('crypto');
const fs = require('fs');
const SparkMD5 = require('spark-md5');

function generateMD5File(file) {
    return new Promise((resolve, reject) => {
        const hash = crypto.createHash('md5');
        const stream = fs.createReadStream(file);

        stream.on('data', function (data) {
            hash.update(data);
        });

        stream.once('end', function () {
            resolve(hash.digest('hex'));
        });

        stream.once('error', function (error) {
            reject(error);
        });
    });
}

exports.generateMD5File = generateMD5File;

function generateSparkMD5File(file) {
    return new Promise((resolve, reject) => {
        const hash = new SparkMD5();
        const stream = fs.createReadStream(file);

        stream.on('data', function (data) {
            hash.append(data);
        });

        stream.once('end', function () {
            resolve(hash.end());
        });

        stream.once('error', function (error) {
            reject(error);
        });
    });
}

exports.generateSparkMD5File = generateSparkMD5File;
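A hedged note on a likely cause (not confirmed against the reporter's files): plain new SparkMD5() hashes strings, so appending an ArrayBuffer in the browser or a Buffer in Node first coerces the value to a string, and each environment coerces differently. A sketch of binary-safe variants of the snippets above:

    // Browser: use the ArrayBuffer-aware incremental API for FileReader results.
    const hashAlgorithm = new SparkMD5.ArrayBuffer();
    hashAlgorithm.append(buffer); // buffer is the ArrayBuffer from e.target.result
    const hex = hashAlgorithm.end();

    // Node: appendBinary expects a binary string, so decode each Buffer as latin1.
    const hash = new SparkMD5();
    stream.on('data', (data) => hash.appendBinary(data.toString('binary')));
    stream.once('end', () => resolve(hash.end()));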
I got wrong checksums for all files bigger than 500 MB in Firefox and Chrome.
When I try to get the compressed file's MD5 value, it's different from the value I get from Java / Windows / C#. Could you please fix this and let me know when it's ready?
Thanks,
Viji
Investigate this issue.
Hi Guys,
Just a heads up that the demo link in your documentation (Demo: http://9px.ir/demo/incremental-md5.html) points to a country that is on the US embargo list, so any software that uses your library without minification may run into problems with US regulatory compliance. You might want to remove the link or host it elsewhere. That link was enough to get one of our mobile apps booted from the app store.
Regards,
Cory
    SparkMD5.prototype.appendBinary = function (contents) {
        this._buff += contents;
        this._length += contents.length;

        var length = this._buff.length,
            i;

This errors in IE: contents may be null.
For a single file, how can I bring in web workers to compute the hash in parallel? I tried it locally: computing the hash of each file chunk is quite fast, dozens of times faster than before, but I don't know how to merge the per-chunk hashes. (The result must match the hash of the whole file.)
Opera has a bug in readAsArrayBuffer(): it needs a new FileReader to be instantiated every time.
While this strategy works, it slows down every browser just because of Opera.
The example should detect Opera and only use the slower code path for it.
All other browsers should not be penalized.
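A hedged sketch of what that detection could look like (the UA test is an assumption; Opera detection is notoriously fragile):

    // Reuse a single FileReader everywhere except Opera, which needs a fresh
    // one per readAsArrayBuffer() call because of the bug described above.
    var isOpera = /\bOPR\/|\bOpera\b/.test(navigator.userAgent);
    var sharedReader = new FileReader();

    function getReader() {
        return isOpera ? new FileReader() : sharedReader;
    }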
Wrong code, where spark.end() is executed twice:

    console.log(spark.end())
    callBack(spark.end())

Correct code:

    tmp_md5 = spark.end()
    callBack(tmp_md5)

I found that the results of the two executions are different. The executed code is spark.end(). I didn't look at the source code carefully, so I hope the author can answer.
I noticed that in the toUtf8 function you are using the unescape method, which is deprecated. Wouldn't using decodeURIComponent achieve the same goal?
I tried to use your awesome library to get the raw result but got an array: '1746531795,941883451,402336312,-1378573203'.
I expected a string 16 chars in length. What am I doing wrong?
You can reproduce it with the examples from the test directory. Just set the second parameter to true as in the instructions: SparkMD5.hashBinary(e.target.result, true).
How can I get a raw binary string like the PHP function md5(str, true)?
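A hedged workaround sketch, assuming the array is MD5's four little-endian int32 state words (as older versions apparently returned for raw output): serializing each word byte by byte yields the 16-byte string PHP's md5($s, true) would produce.

    // Convert a 4-element int32 MD5 state array (little-endian) to a
    // 16-character raw binary string, like PHP's md5($s, true).
    function stateToRawString(words) {
        var out = '';
        for (var i = 0; i < 4; i += 1) {
            for (var j = 0; j < 4; j += 1) {
                out += String.fromCharCode((words[i] >>> (j * 8)) & 0xff);
            }
        }
        return out;
    }

    var raw = stateToRawString([1746531795, 941883451, 402336312, -1378573203]);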
On big files the method returns a wrong result:

    SparkMD5.hashBinary(e.target.result)

In my tests, if the file is bigger than 600 MB the result is wrong; I always get d41d8cd98f00b204e9800998ecf8427e (the MD5 of the empty string). If the file is smaller than 300 MB, it works well.
Hi,
I've seen that js-spark-md5 is based on work by Joseph Myers. Is it possible to know the original license for the Joseph Myers' code?
Thank you,
Alex
Not sure what I'm doing wrong here; following the documentation, I do something like this:
var img = document.getElementById('image');
var imgBlob = base64ToBlob(getBase64Image(img));
var spark = new SparkMD5.ArrayBuffer();
spark.append(imgBlob);
console.log('md5: ' + spark.end());
No matter which image I use as the test, I always get the md5:
d0ad09ba8fe3801ac437d06ba62740d2
From the filesystem, I get the md5:
$ md5 test_image.png
MD5 (test_image.png) = d1a87269a70cb65ed4a41d272d376052
I've also tried with:
SparkMD5.ArrayBuffer.hash(imgBlob, false);
"d41d8cd98f00b204e9800998ecf8427e"
Which always returns that md5 no matter which test image I use.
The same happens when I use a more complicated example with transferred files (non-images), I get the same md5 sum that I get here with the images.
Any idea what I'm doing wrong here?
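A hedged note on a likely cause: SparkMD5.ArrayBuffer.append() expects an ArrayBuffer, not a Blob, and d41d8cd98f00b204e9800998ecf8427e is the MD5 of empty input, which suggests the Blob is effectively ignored. A sketch that reads the Blob into an ArrayBuffer first (imgBlob from the snippet above):

    var reader = new FileReader();
    reader.onload = function (e) {
        console.log('md5: ' + SparkMD5.ArrayBuffer.hash(e.target.result));
    };
    reader.readAsArrayBuffer(imgBlob);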
Hi,
I've been playing with SparkMD5 a bit and I'm now facing some curious behavior from the library:
var a = new Uint8Array(4);
// fill up with ASCII
a[0] = 65; a[1] = 66; a[2] = 67; a[3] = 68;
var equivalentString = String.fromCharCode.apply(null, new Uint8Array(a));
var arraySpark = new SparkMD5();
arraySpark.appendBinary(a);
console.log("Array Spark: %s", arraySpark.end());
// => Array Spark: b2c0119607b38477963f46526b4d162f
var stringSpark = new SparkMD5();
stringSpark.appendBinary(equivalentString);
console.log("String Spark: %s", stringSpark.end());
// => String Spark: cb08ca4a7bb5f9683c19133a84872ca7
I was expecting the two values to be identical. Am I doing something wrong?
From the other MD5 implementations available online, I'm expecting "cb08ca4a7bb5f9683c19133a84872ca7".
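A hedged note: appendBinary() expects a binary string, so a Uint8Array is likely coerced to a string first. A sketch that hashes the underlying bytes via the ArrayBuffer API instead (a is the Uint8Array from above):

    var arraySpark = new SparkMD5.ArrayBuffer();
    arraySpark.append(a.buffer); // append expects an ArrayBuffer
    console.log('Array Spark: %s', arraySpark.end());
    // expected: cb08ca4a7bb5f9683c19133a84872ca7, matching the string variant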
I see examples for updating an md5 incrementally based on incoming chunks of the file, with append(). I was wondering if there was a way to give SparkMD5 an existing md5 string of the file up to the point where you will provide the next chunk of data.
For example, if you are resuming a previously aborted operation, and you know you have computed the first 10 of 15 chunks. If you have the md5 of the first 10 chunks stored in the browser cache, you should be able to give SparkMD5 that md5 string, then send it the 11th chunk, and so on.
Is that possible?
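A hedged answer sketch: a finished hex digest cannot be resumed, but spark-md5 (v2+) exposes getState()/setState(), which capture the internal state mid-stream; the variable names below are hypothetical, and whether the state object survives serialization to browser storage unchanged is an assumption to verify.

    // Hash what has been processed so far, then persist the mid-stream state.
    var spark = new SparkMD5();
    spark.append(firstTenChunks); // hypothetical string holding chunks 1-10
    var saved = spark.getState(); // plain object describing the internal state

    // Later, e.g. when resuming an aborted upload:
    var resumed = new SparkMD5();
    resumed.setState(saved);
    resumed.append(eleventhChunk); // hypothetical string for chunk 11
    console.log(resumed.end());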
Do you know what would be cool? Hashing a large file with streams!
The spec is coming together, and I just thought "hey, it would be cool to use it in spark-md5!"
There are two ways you could do it: as a WritableStream or as a ReadableStream. If you gave a ReadableStream to the API, then it would be in control and could handle the buffer allocation with BYOB. However, the append() operator is more like a write stream, so you would need to provide a way to create a write stream that is connected to the core.
Currently you can get a ReadableStream from the fetch API in Blink. You can also construct a ReadableStream in Blink now, or you could use the web-streams-polyfill. So it would make sense to just hand it over to spark in some way.
I have also created a way to get a ReadableStream from blobs/files with Screw-FileReader.
One way you could do it is:

    ws = spark.createWriteStream()
    blob.stream().pipeTo(ws).then(() => spark.end())

Or just hand over the ReadableStream to spark in some way, because right now the fetch ReadableStream doesn't have pipeTo yet, since WritableStream is not implemented. One possible way would also be to hash it and upload it at the same time, as given by the example with tee().
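For what it's worth, with today's standard WHATWG streams such a bridge could be sketched like this; sparkWritable is a hypothetical helper, not spark-md5 API:

    // Wrap the incremental hasher in a WritableStream sink.
    function sparkWritable(spark) {
        return new WritableStream({
            write(chunk) {
                // chunk is a Uint8Array; copy out exactly the bytes it views.
                spark.append(chunk.buffer.slice(chunk.byteOffset, chunk.byteOffset + chunk.byteLength));
            },
        });
    }

    const spark = new SparkMD5.ArrayBuffer();
    blob.stream()
        .pipeTo(sparkWritable(spark))
        .then(() => console.log(spark.end()));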
I have a module defined like this:

    define(['jquery', 'md5', 'Bacon'], function ($, md5, Bacon) {
        console.log($);
        console.log(md5);
        console.log(Bacon);
    });

Assuming the paths are right, this logs some code for $ and Bacon but undefined for md5.
If I edit md5.js and change the line

    define('spark-md5', factory);

to

    define(factory);

the problem is gone.
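A hedged alternative that avoids patching the file: since the library registers itself as the named AMD module 'spark-md5', RequireJS needs the path mapped under exactly that name (the path below is an assumption):

    require.config({
        paths: {
            // The module id must be 'spark-md5' to match the named define() call.
            'spark-md5': 'vendor/spark-md5'
        }
    });

    define(['jquery', 'spark-md5', 'Bacon'], function ($, SparkMD5, Bacon) {
        console.log(SparkMD5); // no longer undefined
    });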
I wanted to compute the hash for one chunk and get it, then continue computing the hash for chunks 1+2 and get it, then 1+2+3, etc. Basically I need incremental hash computing, and as I compute it I need the intermediate hashes. But this is not working; I'm getting wrong hashes.
The toUTF8 function currently uses unescape to encode strings. However, according to MDN unescape is deprecated and decodeURIComponent/decodeURI are recommended instead.
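A hedged side note: the library's idiom is reportedly unescape(encodeURIComponent(str)), which produces a UTF-8 binary string, and decodeURIComponent alone would not reproduce that. A modern equivalent sketch uses TextEncoder:

    // Encode a JS string to the same UTF-8 binary string the unescape idiom produces.
    function toUtf8(str) {
        var bytes = new TextEncoder().encode(str); // Uint8Array of UTF-8 bytes
        var out = '';
        for (var i = 0; i < bytes.length; i += 1) {
            out += String.fromCharCode(bytes[i]);
        }
        return out;
    }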
I need to create an MD5 checksum of a file in a JavaScript Windows 8 application. Can someone help me?
On the readme it says "see bellow" instead of "see below".
I noticed that the JavaScript file doesn't have a license header. This seems to be a common practice amongst JavaScript libraries.
Would it be possible to add the information to the file so users can easily adhere to the project's license?
Something like:
Licensed under the WTFPL License
Thanks.
https://jsfiddle.net/crl/6uaktq3s/ it seems js-md5 is faster
When I executed the test files in Chrome, I was able to calculate the checksum of ~600 MB zip file in less than a minute. However, when I tried the same using IE-11, it took about 3 minutes for the same file.
While debugging, I observed that the following line took a long time to execute in IE 11: spark.append(e.target.result);
Maybe it is obvious to others, but I fell into this trap today and it took some time to find the real reason.
See this example: https://jsfiddle.net/99Lxy7yx/1/ and choose any file over 10k in size. The computed MD5 is wrong.
See this: https://jsfiddle.net/99Lxy7yx/4/ with the same file again. The MD5 is fine, apparently just because the chunk size is a multiple of 8.
This should be mentioned somewhere in the docs.
PC with 4 GB of memory: in Chrome, calculating the MD5 of a 300 MB file breaks down...
This seems to be related to the incremental interface, because I cannot reproduce this bug when I try to do an md5 hash all at once.
I don't have a small reproducible test case for you yet, but if you check out the pouchdb/pouchdb#3379 branch and run

    npm install
    npm run dev

then load this link in a browser, you will see the failure.
Sorry for a lack of details; I will try to delve deeper unless you would like to investigate it yourself. (I'm a bit busy at the moment, but I thought it was prudent to let you know about this bug.)
I'm having really weird behavior trying to perform an md5 checksum on data stored in IndexedDB:

I collect chunks of a file and store them in IndexedDB at around 1 MB chunk size or less.
I create a SparkMD5.ArrayBuffer() object (spark).
As I retrieve each chunk from idb, I add it to spark.append().
I then call spark.end() and always get an incorrect hash.
However, if I create a new Blob(collectedArray), call fileReader.readAsArrayBuffer() on the blob, and then perform a hash on the new array buffer, I get the correct md5 checksum.

Here's an example of my code, assuming I've already collected all of the chunks from IndexedDB and placed them, in order, in the collectedArray:
    var spark = new SparkMD5.ArrayBuffer();

    collectedArray.forEach(function (data, i) {
        spark.append(data);
    });

    console.log('MD5: ' + original_md5 + ' GEN: ' + spark.end()); // here, new md5 is incorrect

    var blob = new Blob(collectedArray);
    var fileReader = new FileReader();

    fileReader.onload = function () {
        var spark2 = new SparkMD5.ArrayBuffer();
        spark2.append(this.result);
        console.log('NEW MD5: ' + spark2.end()); // here the new md5 is correct
    };

    fileReader.readAsArrayBuffer(blob);
I think this is probably some sort of user error on my part, but I have been stumped for a couple of days. I'm hoping your expertise in this area might be able to point me in the right direction as to how I could even go about debugging this.
It's really important because, in the long run, I need to perform the incremental md5sum as the data is coming into IndexedDB, so I will need to use the append feature and will not have all of the pieces until the very end.
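A hedged debugging sketch: SparkMD5.ArrayBuffer.append() expects raw ArrayBuffers, so if IndexedDB hands back typed-array views (possibly with a byteOffset), normalizing them first is worth trying:

    // Normalize whatever IndexedDB returned to plain ArrayBuffers before appending.
    collectedArray.forEach(function (data) {
        if (ArrayBuffer.isView(data)) {
            // Copy out exactly the bytes the view covers.
            data = data.buffer.slice(data.byteOffset, data.byteOffset + data.byteLength);
        }
        spark.append(data);
    });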
Could you please publish 1.0.1 to npm?
2.0.0 breaks my use case because I would rather use the original hash calculation instead of the hex string.
Hi,
Right now, if I calculate the MD5 value using spark, it takes 1 minute for a 2 GB file.
If I try to introduce multithreading, with a 10-second delay between each thread reading a file chunk and calculating the MD5 in onloadend, it takes the same time. If I reduce the delay to 5 seconds or less, it calculates wrong MD5 values.
I wonder, is there any way to speed up the MD5 calculation for bigger files?
Thanks a lot.
Regards,
Viji
Version 3.0.1 has been released on npm, but it doesn't exist within this repo. Can someone confirm what has been released for 3.0.1 and tag the appropriate sha, please?
I try to hash the domain "нашеправо24.рф".
It should be: c6d317eda9b11438faa67a0977f0d9a1
but instead I get: 14effa5c5ede3c687e762efbcc7e26a7
Only in Edge. Other browsers (Chrome, Firefox) do the job well.
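A hedged workaround sketch: bypass the library's string path and hash the UTF-8 bytes directly with the ArrayBuffer API; this should match the expected digest, assuming the expected value was computed over UTF-8 bytes.

    // Encode the string to UTF-8 and hash the raw bytes, sidestepping toUtf8.
    var bytes = new TextEncoder().encode('нашеправо24.рф');
    var spark = new SparkMD5.ArrayBuffer();
    spark.append(bytes.buffer);
    console.log(spark.end()); // expected: c6d317eda9b11438faa67a0977f0d9a1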
Uncaught URIError: malformed URI sequence
toUtf8 $:/plugins/tiddlywiki/evernote/modules/spark-md5.min.js:11
hash $:/plugins/tiddlywiki/evernote/modules/spark-md5.min.js:11
exports["application/enex+xml"]/</< $:/plugins/tiddlywiki/evernote/modules/enex-deserializer.js:66
each
First of all, thanks a lot for creating this very useful library! 🙏
I recently needed to calculate an MD5 hash of a File object, and while I saw the section in the README showing how to do that, I really didn't like how much custom code it involves.
I was wondering, could this functionality maybe be part of this library? In comparison, Ruby has a Digest::MD5 class which supports calculating a hash from a single string, incremental hashing in chunks, and calculating a hash from a file on disk.
Digest::MD5.hexdigest("string")
# or
md5 = Digest::MD5.new
md5.update("chunk1")
md5.update("chunk2")
md5.hexdigest
# or
Digest::MD5.file("/path/to/file").hexdigest
It took me quite a while to find a JavaScript library which simplifies reading a File object in chunks (chunked-file-reader), and it appears to work correctly (I get the same MD5 hash as with the snippet in the README here). So I came up with the following function:
    function fileMD5(file) {
        return new Promise(function (resolve, reject) {
            var spark = new SparkMD5.ArrayBuffer(),
                reader = new ChunkedFileReader();

            reader.subscribe('chunk', function (e) {
                spark.append(e.chunk);
            });

            reader.subscribe('end', function (e) {
                var rawHash = spark.end(true);
                var base64Hash = btoa(rawHash);

                resolve(base64Hash);
            });

            reader.readChunks(file);
        });
    }
Since it took me a while to come up with this solution, I was wondering if it made sense to have that built into spark-md5.
Hi there,
I wonder whether it would be possible for you to provide an ESM-compatible build? Why? It's all about the shift from require() to import / export style syntax.
My special use case is this: I'm migrating our code base's unit tests (Jest) to ESM modules, see https://jestjs.io/docs/ecmascript-modules
For this to work, however, the packages need to conform to the new ESM module standards. For example, Jest will fail when it encounters old CommonJS-style require() calls. In the case of spark-md5, it is not even possible (for me) to add it to Jest's transformIgnorePatterns (and have it be re-compiled) because (I'm not a UMD expert) it seems to me that the generated UMD file (spark-md5.js) is malformed. So for now I have to copy the whole source code of spark-md5 directly into my code base (in an ESM-compatible way) as a workaround.
The new exports property in package.json helps with providing builds delivering modern ESM modules. Being backwards-compatible with old Node.js versions at the same time is easy, because old Node.js versions will ignore the exports property and keep referring to package.json's main property instead (so you can still provide a CommonJS build and point to it via the main property). If you don't know the new exports property, I recommend a short read of the following resources:
Best Regards,
Nicolas
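For reference, a hedged sketch of what such a dual build's package.json could look like; the spark-md5.mjs file name is hypothetical, not something the package ships today:

    {
      "name": "spark-md5",
      "main": "./spark-md5.js",
      "exports": {
        ".": {
          "import": "./spark-md5.mjs",
          "require": "./spark-md5.js"
        }
      }
    }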
32-bit Windows with 4 GB of memory:
2018-09-29 08:57:04,468: message:{msg=[***]RangeError: Array buffer allocation failed @ new ArrayBuffer (
<anonymous>) @ new Uint8Array (native) @ d (https://www.***.com/static/js/main.173fa413.js:34:14944) @ p.ArrayBu
ffer.append (https://www.****.com/static/js/main.173fa413.js:34:17224) @ FileReader.l.onload (https://www.***.com/
static/js/main.173fa413.js:11:7385), t=1538182625000, body=, userId=139}
2018-09-29 10:27:26,023: message:{msg=[***]RangeError: Array buffer allocation failed @ new ArrayBuffer (
<anonymous>) @ typedArrayConstructByLength (<anonymous>) @ new Uint8Array (native) @ d (https://www.***.com/stat
ic/js/main.173fa413.js:34:14944) @ p.ArrayBuffer.append (https://www.***.com/static/js/main.173fa413.js:34:17224) @
FileReader.l.onload (https://www.***.com/static/js/main.173fa413.js:11:7385), t=1538188042322, body=, userId=1013}
    return new Promise((resolve, reject) => {
        let chunkCount = Math.ceil(file.size / hashChunkSize)
        let currentChunk = 0
        let spark = new SparkMD5.ArrayBuffer()
        let sparkArray = []
        let fileReader = new FileReader()

        fileReader.onload = function (event) {
            sparkArray.push(SparkMD5.ArrayBuffer.hash(event.target.result))
            spark.append(event.target.result) // Append array buffer
            currentChunk++

            if (currentChunk < chunkCount) {
                loadNext()
            } else {
                resolve({
                    md5: spark.end(),
                    md5Array: sparkArray
                })
            }
        }

        fileReader.onerror = () => {
            reject(new Error('read file failed'))
        }

        function loadNext() {
            let start = currentChunk * hashChunkSize
            let end = ((start + hashChunkSize) >= file.size) ? file.size : start + hashChunkSize

            fileReader.readAsArrayBuffer(blobSlice.call(file, start, end))
        }

        loadNext()
    })
It seems that spark.append(event.target.result) can't get enough memory...