Comments (18)
The future format may be something like "extension [-|gzip] offset magic_values".
Extending the signature format will break the existing signatures...
Currently, you have to modify file_gz.c directly, for example:
diff -ruw testdisk/src/file_gz.c ../testdisk-7.1-WIP/src/file_gz.c
--- testdisk/src/file_gz.c 2018-03-22 13:20:13.628017471 +0100
+++ ../testdisk-7.1-WIP/src/file_gz.c 2018-04-10 11:22:25.121483486 +0200
@@ -177,6 +177,12 @@
file_recovery_new->min_filesize=22;
file_recovery_new->time=le32(gz->mtime);
file_recovery_new->file_rename=&file_rename_gz;
+ if(memcmp(buffer_uncompr, "BAM\1", 4)==0)
+ {
+ /* https://github.com/samtools/hts-specs SAM/BAM and related high-throughput sequencing file formats */
+ file_recovery_new->extension="bam";
+ return 1;
+ }
if(memcmp(buffer_uncompr, "PVP ", 4)==0)
{
/* php Video Pro */
from testdisk.
Thanks @cgsecurity - to keep previous signature files working without breaking changes, incorporating a separate photorec.compress.sig file might be an option for adding compressed formats (the second field should contain a valid compression, such as gzip, and maybe also others such as bz2 and so on). This is quite important for bioinformatics, where most of the formats are compressed.
In addition, it looks like PhotoRec pulls out BAM files as separate gzip files. Would it be possible to detect when several gzip members are concatenated into a block-compressed file (bgzip)?
from testdisk.
Here there are some magic patterns for bioinformatics: https://github.com/lindenb/magic/tree/master/patterns
Maybe that could be implemented in photorec (or an extension of it) to recover bioinformatics data...
from testdisk.
Do you mean that PhotoRec recovers a single bam file as several gzip files? If that is the case, please try to reproduce the problem with photorec /d recup_dir /cmd sample.bam search, and if you get several gz files, please share the sample file.
from testdisk.
Using this BAM (https://github.com/broadinstitute/gatk/blob/master/src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam) produces the following result with your command (without the fix proposed in your previous comment):
-rw-r--r-- 1 daniel staff 15M Apr 10 12:31 f0000000.gz
-rw-r--r-- 1 daniel staff 4.1M Apr 10 12:31 f0030938.gz
-rw-r--r-- 1 daniel staff 4.4M Apr 10 12:31 f0039252.gz
-rw-r--r-- 1 daniel staff 39M Apr 10 12:31 f0048275.gz
-rw-r--r-- 1 daniel staff 1.4M Apr 10 12:31 f0128504.gz
-rw-r--r-- 1 daniel staff 6.2M Apr 10 12:31 f0131315.gz
-rw-r--r-- 1 daniel staff 5.1M Apr 10 12:31 f0143990.gz
-rw-r--r-- 1 daniel staff 815K Apr 10 12:31 f0154341.gz
-rw-r--r-- 1 daniel staff 3.1K Apr 10 12:31 report.xml
With the fix for BAM files, the only difference is that f0000000.gz is renamed to f0000000.bam.
from testdisk.
While recovering some real case data, I am realizing that it is not only a problem with the BAM format, but also with any file compressed with bgzip.
from testdisk.
I have started working on handling bgzip:
--- testdisk/src/file_gz.c 2018-03-22 13:20:13.628017471 +0100
+++ testdisk-7.1-WIP/src/file_gz.c 2018-04-10 14:00:00.000000000 +0200
@@ -36,7 +36,6 @@
#include "file_gz.h"
static void register_header_check_gz(file_stat_t *file_stat);
-static int header_check_gz(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new);
static void file_rename_gz(file_recovery_t *file_recovery);
extern const file_hint_t file_hint_doc;
@@ -59,7 +58,6 @@
uint8_t os;
} __attribute__ ((gcc_struct, __packed__));
-static const unsigned char gz_header_magic[3]= {0x1F, 0x8B, 0x08};
/* flags:
bit 0 FTEXT
bit 1 FHCRC
@@ -76,9 +74,38 @@
#define GZ_FNAME 8
#define GZ_FCOMMENT 0x10
-static void register_header_check_gz(file_stat_t *file_stat)
+static void file_check_bgzf(file_recovery_t *file_recovery)
{
- register_header_check(0, gz_header_magic,sizeof(gz_header_magic), &header_check_gz, file_stat);
+}
+
+static int header_check_bgzf(const unsigned char *buffer, const unsigned char *buffer_uncompr, const unsigned int buffer_size, file_recovery_t *file_recovery_new)
+{
+ const struct gzip_header *gz=(const struct gzip_header *)buffer;
+ reset_file_recovery(file_recovery_new);
+ file_recovery_new->min_filesize=22;
+ file_recovery_new->time=le32(gz->mtime);
+ file_recovery_new->file_rename=&file_rename_gz;
+ file_recovery_new->file_check=&file_check_bgzf;
+ if(memcmp(buffer_uncompr, "BAI\1", 4)==0)
+ {
+ /* https://github.com/samtools/hts-specs SAM/BAM and related high-throughput sequencing file formats */
+ file_recovery_new->extension="bai";
+ return 1;
+ }
+ if(memcmp(buffer_uncompr, "BAM\1", 4)==0)
+ {
+ /* https://github.com/samtools/hts-specs SAM/BAM and related high-throughput sequencing file formats */
+ file_recovery_new->extension="bam";
+ return 1;
+ }
+ if(memcmp(buffer_uncompr, "CSI\1", 4)==0)
+ {
+ /* https://github.com/samtools/hts-specs SAM/BAM and related high-throughput sequencing file formats */
+ file_recovery_new->extension="csi";
+ return 1;
+ }
+ file_recovery_new->extension=file_hint_gz.extension;
+ return 1;
}
static int header_check_gz(const unsigned char *buffer, const unsigned int buffer_size, const unsigned int safe_header_only, const file_recovery_t *file_recovery, file_recovery_t *file_recovery_new)
@@ -86,6 +113,7 @@
unsigned int off=10;
const unsigned int flags=buffer[3];
const struct gzip_header *gz=(const struct gzip_header *)buffer;
+ int bgzf=0;
/* gzip file format:
* a 10-byte header, containing a magic number, a version number and a timestamp
* optional extra headers, such as the original file name,
@@ -106,6 +134,8 @@
{
off+=2;
off+=buffer[10]|(buffer[11]<<8);
+ if(buffer[12]=='B' && buffer[13]=='C' && buffer[14]==2 && buffer[15]==0)
+ bgzf=1;
}
if((flags&GZ_FNAME)!=0)
{
@@ -133,6 +163,11 @@
if(header_ignored_adv(file_recovery, file_recovery_new)==0)
return 0;
}
+ if(file_recovery->file_check==&file_check_bgzf)
+ {
+ header_ignored(file_recovery_new);
+ return 0;
+ }
#if defined(HAVE_ZLIB_H) && defined(HAVE_LIBZ)
{
static const unsigned char schematic_header[12]={ 0x0a, 0x00, 0x09,
@@ -173,6 +208,10 @@
if(d_stream.total_out < 16)
return 0;
buffer_uncompr[d_stream.total_out]='\0';
+ if(bgzf!=0)
+ {
+ return header_check_bgzf(buffer, buffer_uncompr, d_stream.total_out, file_recovery_new);
+ }
reset_file_recovery(file_recovery_new);
file_recovery_new->min_filesize=22;
file_recovery_new->time=le32(gz->mtime);
@@ -291,3 +330,9 @@
return "none";
#endif
}
+
+static void register_header_check_gz(file_stat_t *file_stat)
+{
+ static const unsigned char gz_header_magic[3]= {0x1F, 0x8B, 0x08};
+ register_header_check(0, gz_header_magic,sizeof(gz_header_magic), &header_check_gz, file_stat);
+}
I don't know if bai and csi files are also compressed. Can you build upon this patch?
from testdisk.
Many thanks for looking into this. I will check your patch as soon as possible (I haven't even looked into the code yet, because this is a quick answer). Do you have the patch from your previous comment committed to some repository (either a branch here or in the official one)? That would be useful for me to test your changes.
In the meantime, some hints about bioinformatic formats to help you working on it:
- Answering your question: BAM, BAI and CSI are bgzip-compressed files. Nevertheless, see below for some problems with the indexes.
- bgzip compression is not only expected in those formats, but also in other tab-delimited files - this might be important for properly recovering other bgzip files. My recommendation is to set the bgz extension if the file is detected to be block-compressed and is not one of the supported BAM-related files.
- A recommendation for index files: there are two naming conventions (bam.bai and bai; bam.csi and csi, and even cram.csi), but I recommend sticking to the simpler one (bai, csi) to be on the safe side (a csi index might be associated with something other than a BAM file).
For your information and to make easier your progress here, I implemented in my fork (see the branch https://github.com/magicDGS/testdisk/tree/dgs_bioinf_files) some of the common bioinformatics formats that might be compressed. That can be a reference for your fix.
from testdisk.
Using the patch, I re-ran the command to check the CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam file, and it did not create any file, just the following report.xml:
<?xml version='1.0' encoding='UTF-8'?>
<dfxml xmloutputversion='1.0'>
<metadata
xmlns='http://www.forensicswiki.org/wiki/Category:Digital_Forensics_XML'
xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance'
xmlns:dc='http://purl.org/dc/elements/1.1/'>
<dc:type>Carve Report</dc:type>
</metadata>
<creator>
<package>PhotoRec</package>
<version>7.1-WIP</version>
<build_environment>
<compiler>GCC 4.2</compiler>
<library name='libext2fs' version='none'/>
<library name='libewf' version='none'/>
<library name='libjpeg' version='none'/>
<library name='libntfs' version='none'/>
<library name='zlib' version='1.2.5'/>
</build_environment>
<execution_environment>
<os_sysname>Darwin</os_sysname>
<os_release>15.6.0</os_release>
<os_version>Darwin Kernel Version 15.6.0: Tue Jan 9 20:12:05 PST 2018; root:xnu-3248.73.5~1/RELEASE_X86_64</os_version>
<host>i122mc132.vu-wien.ac.at</host>
<arch>x86_64</arch>
<uid>502</uid>
<start_time>2018-04-10T16:12:50+0200</start_time>
</execution_environment>
</creator>
<source>
<image_filename>/Users/daniel/workspaces/gatk_magicdgs/src/test/resources/large/CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam</image_filename>
<sectorsize>512</sectorsize>
<image_size>79856849</image_size>
<volume>
<byte_runs>
<byte_run offset='0' img_offset='0' len='79856849'/>
</byte_runs>
</volume>
</source>
<configuration>
</configuration>
</dfxml>
from testdisk.
Can you check the dev branch ?
from testdisk.
Sorry for my previous comment. On the same computer I am also running photorec on a failing disk with gzip files disabled, and the run that I showed picked up that configuration. Is it possible to pass a different configuration to a separate photorec run?
Once I figure out how to run it without killing the process, I will check the dev branch. Thanks!
from testdisk.
Ok, I found a way, using photorec /d recup_dir /cmd CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam fileopt,everything,enable,search. The tests that I did:
- Using the current brew version (7.0):
-rw-r--r-- 1 daniel staff 15M Apr 11 11:38 f0000000.gz
-rw-r--r-- 1 daniel staff 4.1M Apr 11 11:38 f0030938.gz
-rw-r--r-- 1 daniel staff 4.4M Apr 11 11:38 f0039252.gz
-rw-r--r-- 1 daniel staff 39M Apr 11 11:38 f0048275.gz
-rw-r--r-- 1 daniel staff 1.4M Apr 11 11:38 f0128504.gz
-rw-r--r-- 1 daniel staff 6.2M Apr 11 11:38 f0131315.gz
-rw-r--r-- 1 daniel staff 5.1M Apr 11 11:38 f0143990.gz
-rw-r--r-- 1 daniel staff 815K Apr 11 11:38 f0154341.gz
-rw-r--r-- 1 daniel staff 3.0K Apr 11 11:38 report.xml
- Using the patch applied to my branch (I don't know if it is the same as the dev branch):
-rw-r--r-- 1 daniel staff 76M Apr 11 11:39 f0000000.bam
-rw-r--r-- 1 daniel staff 1.6K Apr 11 11:39 report.xml
- Using the dev branch:
-rw-r--r-- 1 daniel staff 76M Apr 11 11:46 f0000000.bam
-rw-r--r-- 1 daniel staff 1.6K Apr 11 11:46 report.xml
And the MD5 is the same for all the files: 1cb7aa6facf25bb759b1e1d00dd19a3d
To test whether a bgzip-compressed non-BAM file is also split, I took the BAM file, converted it to plain text using samtools and compressed it with bgzip. The command was samtools view -h CEUTrio.HiSeq.WGS.b37.NA12878.20.21.bam | bgzip -c > CEUTrio.HiSeq.WGS.b37.NA12878.20.21.sam.bgz
In this case, the result was:
- Using brew version:
-rw-r--r-- 1 daniel staff 6.3M Apr 11 11:49 f0000000.gz
-rw-r--r-- 1 daniel staff 16M Apr 11 11:49 f0012983.gz
-rw-r--r-- 1 daniel staff 4.9M Apr 11 11:49 f0045914.gz
-rw-r--r-- 1 daniel staff 7.6M Apr 11 11:49 f0055896.gz
-rw-r--r-- 1 daniel staff 5.1M Apr 11 11:49 f0071394.gz
-rw-r--r-- 1 daniel staff 556K Apr 11 11:49 f0081749.gz
-rw-r--r-- 1 daniel staff 11M Apr 11 11:49 f0082860.gz
-rw-r--r-- 1 daniel staff 5.6M Apr 11 11:49 f0106065.gz
-rw-r--r-- 1 daniel staff 1.4M Apr 11 11:49 f0117549.gz
-rw-r--r-- 1 daniel staff 9.7M Apr 11 11:49 f0120481.gz
-rw-r--r-- 1 daniel staff 883K Apr 11 11:49 f0140341.gz
-rw-r--r-- 1 daniel staff 216K Apr 11 11:49 f0142107.gz
-rw-r--r-- 1 daniel staff 1.9M Apr 11 11:49 f0142539.gz
-rw-r--r-- 1 daniel staff 4.0K Apr 11 11:49 report.xml
- Using dev branch version:
-rw-r--r-- 1 daniel staff 72M Apr 11 11:49 f0000000.gz
-rw-r--r-- 1 daniel staff 1.6K Apr 11 11:49 report.xml
So it also works for other block-compressed files where the format is not known (in your branch, anything other than BAM/BAI/CSI). Nevertheless, I recommend that the file extension used in this case be bgz, to show that it was detected as a block-compressed file. That will make recovery easier, because knowing whether a file was detected as bgzip or gzip might help to identify it (e.g., some formats in bioinformatics, such as FASTA, are compressed with gzip, while other formats should always be compressed with bgzip).
Thanks a lot for the work done here.
from testdisk.
Can you change cdde957#diff-bcb8aa815b8b17b77dac79ecc7656e8eR107 to set the extension to bgz? I think that will be perfect (I tested that normal gzipped files are set to gz).
Thanks a lot for the help and the quick fix!
from testdisk.
Done, I have modified the extension in the dev branch.
from testdisk.
Thank you! This software is awesome, and your commitment to supporting new formats is great!
Is there any plan to include the changes in the next release? And to make a new release soon?
from testdisk.
I have uploaded a new 7.1-WIP (source + binaries) with those changes.
from testdisk.
Thanks!
from testdisk.
7.1 will succeed 7.1-WIP when it is released.
from testdisk.
Related Issues (20)
- Nikon NEF files HOT 1
- Photorec cannot resume if path to .dd/.img was containing space characters before saving session
- ZOOM file extension HOT 4
- [filetype support] add freecad files supported *.fcstd HOT 3
- Please add f2fs filesystem support for the great testdisk (not photorec) HOT 1
- Does TestDisk support apfs? HOT 2
- Suggestions of messages HOT 1
- Tell testdisk from which sector to start HOT 4
- cross compile windows application HOT 2
- ReiserFS support install instruction HOT 6
- Testdisk recovers same files again and again in linux HOT 3
- Add ability to refresh directory in photorec
- Thank you!
- QPhotoRec: Improve File Format Window HOT 1
- when recover ibd file ,the ibd file is too large HOT 1
- NetCDF data recovery HOT 4
- README: incomplete list of PhotoRec formats
- Scriptable mode, similarly to Unix fdisk's sdisk binary
- Recover Partition from MBR partition table; which partition type to choose? HOT 1
- build failure in apfs_common.h HOT 2
Recommend Projects
-
React
A declarative, efficient, and flexible JavaScript library for building user interfaces.
-
Vue.js
🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.
-
Typescript
TypeScript is a superset of JavaScript that compiles to clean JavaScript output.
-
TensorFlow
An Open Source Machine Learning Framework for Everyone
-
Django
The Web framework for perfectionists with deadlines.
-
Laravel
A PHP framework for web artisans
-
D3
Bring data to life with SVG, Canvas and HTML. 📊📈🎉
-
Recommend Topics
-
javascript
JavaScript (JS) is a lightweight interpreted programming language with first-class functions.
-
web
Some thing interesting about web. New door for the world.
-
server
A server is a program made to process requests and deliver data to clients.
-
Machine learning
Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.
-
Visualization
Some thing interesting about visualization, use data art
-
Game
Some thing interesting about game, make everyone happy.
Recommend Org
-
Facebook
We are working to build community through open source technology. NB: members must have two-factor auth.
-
Microsoft
Open source projects and samples from Microsoft.
-
Google
Google ❤️ Open Source for everyone.
-
Alibaba
Alibaba Open Source for everyone
-
D3
Data-Driven Documents codes.
-
Tencent
China tencent open source team.
from testdisk.