- http://www.drugbank.ca/downloads#structures
- ./dat/approved.txt
-
script
./script/count_atms.py
-
result
./dat/approved_heavy_atom_num.txt ./dat/elements.txt
-
pdb –> uniprt
./dat/pdbsws_chain.txt
-
drug bank id –> uniprt ids
- ./script/crawler.py
- ./dat/drugid_uniprturls.dat
-
uniprt id –> pdb id
- ./script/uniprt_pdb.py
- ./dat/uniprt_to_pdb.dat
-
drug bank id –> pdb id
- ./script/query_pdb.py
- ./dat/drugid_pdb.dat
- 82773 pdbs
- 138078 chains
-
table
import pickle import pandas as pd dat_ifn = "./dat/drugid_pdb.dat" with open(dat_ifn, 'r') as f: dat = pickle.load(f) dset = pd.DataFrame({key: list(val) for key, val in dat.iteritems()})
-
DONE Extract the ligand atoms from the CIF files
- ./script/qmol_CIF2PDB.pl (not compatible with latest perl modules)
- work/jaydy/dat/fda_pdb_mb
-
DONE Cluster drug ligands
-
cmd
$ cd ~/Workspace/Bitbucket/fda-approved-drugs/dat/ $ python /home/jaydy/Workspace/Bitbucket/geauxcluster/src/geauxcluster.py -i approved.txt -o approved.txt.clusters -k DRUGBANK_ID
-
script ./script/clustering.py
-
-
DONE Compare the pdb ligand with drug bank ligand
- calculate tanimoto coefficient using Kcombu
- ./script/calculateTanimota.py
- pkcombu Segmentation fault for
- DB01049
- to check pkcombu -A /work/jaydy/working/kcombu_run/DB01049_fda.sdf -B /work/jaydy/dat/fda_pdb_mb/ib/3ibdA.LG3.pdb
- DB00707
- refuse proteins if their ligand's tanimoto < 0.9
- ./script/calculateTanimoto.py
- TANI_DIR = "/work/jaydy/working/tanimoto"
- calculate tanimoto coefficient using Kcombu
-
DONE add missing atoms
./script/fixPrt.py
-
DONE clean the proteins sequences
python ./script/pdb_to_fasta.py > run.sh sh ./script/run.sh
- ctrip, heavy atom model
- /project/michal/apps/jackal_64bit/bin/ctrip
- ./script/pdb2pdb.pl move pdb sequence to start at 1
- ./script/pdb2fasta.pl convert pdb to fasta
- ./script/fasta2fasta.pl break the lines at 80
- ctrip, heavy atom model
-
DONE cd-hit to cluster the sequences
- do not count the prt with seq length > 600
- cluster python ./script/cluster_seq.py > ./script/cluster_seq.sh cd /work/jaydy/working/cluster sh /home/jaydy/Workspace/Bitbucket/fda-approved-drugs/script/cluster_seq.sh
- collect /home/jaydy/Workspace/Bitbucket/fda-approved-drugs/script/collect.py
-
TODO
- original data set of 1554 fda-approved drugs
-
script ./script/count_atms.py ./dat/drug_size.txt
-
result
count 1554.000000 mean 26.257400 std 18.172654 min 1.000000 25% 18.000000 50% 23.000000 75% 30.000000 max 419.000000 -
one atom drugs
DrugBank ID Chem DB01356 Li DB01370 Al DB01592 Fe DB01593 Zn -
2 σ range (0, 62.6)
-
1 σ range (8, 44)
-
- after processing
-
script ./script/drug_size.py
-
result
statistics #HeavyAtom count 197.000000 mean 23.928934 std 12.470170 min 6.000000 25% 16.000000 50% 21.000000 75% 29.000000 max 93.000000 -
within the range of (8, 44)
-
dat ./dat/representative_drugs.csv
stats #HeavyAtom count 186.000000 mean 22.639785 std 8.456204 min 8.000000 25% 16.000000 50% 21.000000 75% 29.000000 max 44.000000 -
3D structures
-
protein bound ligands
import itertools import shutil import os import pandas as pd import cPickle represents_ifn = "./dat/representative_drugs.csv" dat_ifn = "./dat/dat.pkl" with open(dat_ifn, 'r') as f: dset = cPickle.load(f) represents = pd.read_csv(represents_ifn, index_col=0) inter = dset[dset.DRUGBANK_ID.isin(represents.id)] inter.to_csv("./dat/fda_drugs_prt_lig.csv") drugs = {} for index, row in inter.iterrows(): drugs[row.DRUGBANK_ID] = row.ProteinBoundLig.split() complexes = set(itertools.chain(*drugs.values())) print "# complexes %d" % (len(complexes)) proteins = set([complex_id.split('.')[0][:4] for complex_id in complexes]) print "# proteins %d" % (len(proteins)) # retrive the 3D structures DAT_DIR = "/work/jaydy/dat/fda_pdb_mb" STRUCTURE_DIR = "/work/jaydy/dat/fda_drug_structures" def getPaths(ligand_id): mid_two = ligand_id[1:3] ligand_path = os.path.join(DAT_DIR, mid_two, ligand_id) prt_id = ligand_id.split('.')[0] + '.pdb' prt_path = os.path.join(DAT_DIR, mid_two, prt_id) return ligand_path, prt_path def copyFiles(drug_id): dat_dir = os.path.join(STRUCTURE_DIR, drug_id) os.makedirs(dat_dir) for ligand_id in drugs[drug_id]: ligand_path, prt_path = getPaths(ligand_id) shutil.copy(ligand_path, dat_dir) shutil.copy(prt_path, dat_dir) print drug_id, "done" for drug_id in drugs: copyFiles(drug_id)
-
-
-
- original data set of 1554 fda-approved drugs