Coder Social home page Coder Social logo

Comments (34)

YoannRandon avatar YoannRandon commented on May 30, 2024 1

I also have data stored on another server which is mounted on my training machine, maybe this is the problem.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024
{
    "D": {
        "dropout": false,
        "n_layers": 3,
        "ndf": 64,
        "netDs": [
            "projected_d",
            "basic",
            "vision_aided"
        ],
        "norm": "instance",
        "proj_config_segformer": "models/configs/segformer/segformer_config_b0.json",
        "proj_interp": 256,
        "proj_network_type": "vitsmall",
        "proj_weight_segformer": "models/configs/segformer/pretrain/segformer_mit-b0.pth",
        "spectral": false,
        "vision_aided_backbones": "clip+dino+swin"
    },
    "G": {
        "attn_nb_mask_attn": 10,
        "attn_nb_mask_input": 1,
        "config_segformer": "models/configs/segformer/segformer_config_b0.json",
        "dropout": false,
        "nblocks": 9,
        "netE": "resnet_256",
        "netG": "segformer_attn_conv",
        "ngf": 64,
        "norm": "instance",
        "padding_type": "reflect"
    },
    "alg": {
        "gan": {
            "lambda": 1.0
        },
        "cut": {
            "HDCE_gamma": 1.0,
            "HDCE_gamma_min": 1.0,
            "MSE_idt": false,
            "flip_equivariance": false,
            "lambda_MSE_idt": 1.0,
            "lambda_NCE": 1.0,
            "lambda_SRC": 0.0,
            "nce_T": 0.07,
            "nce_idt": true,
            "nce_includes_all_negatives_from_minibatch": false,
            "nce_layers": "0,4,8,12,16",
            "nce_loss": "monce",
            "netF": "mlp_sample",
            "netF_dropout": false,
            "netF_nc": 256,
            "netF_norm": "instance",
            "num_patches": 256
        }
    },
    "data": {
        "online_creation": {
            "color_mask_A": false,
            "crop_delta_A": 50,
            "crop_delta_B": 15,
            "crop_size_A": 512,
            "crop_size_B": 512,
            "load_size_A": [],
            "load_size_B": [],
            "mask_delta_A": [
                50
            ],
            "mask_delta_B": [
                15
            ],
            "mask_random_offset_A": [
                0.0
            ],
            "mask_random_offset_B": [
                0.0
            ],
            "mask_square_A": false,
            "mask_square_B": false,
            "rand_mask_A": false
        },
        "crop_size": 512,
        "dataset_mode": "unaligned_labeled_mask_online",
        "direction": "AtoB",
        "inverted_mask": false,
        "load_size": 512,
        "max_dataset_size": 1000000000,
        "num_threads": 4,
        "online_context_pixels": 0,
        "online_fixed_mask_size": -1,
        "online_select_category": -1,
        "online_single_bbox": false,
        "preprocess": "resize_and_crop",
        "relative_paths": true,
        "sanitize_paths": false
    },
    "f_s": {
        "all_classes_as_one": false,
        "class_weights": [],
        "config_segformer": "models/configs/segformer/segformer_config_b0.json",
        "dropout": false,
        "net": "unet",
        "nf": 64,
        "semantic_nclasses": 2,
        "semantic_threshold": 1.0,
        "weight_sam": "",
        "weight_segformer": ""
    },
    "cls": {
        "all_classes_as_one": false,
        "class_weights": [],
        "config_segformer": "models/configs/segformer/segformer_config_b0.py",
        "dropout": false,
        "net": "vgg",
        "nf": 64,
        "semantic_nclasses": 2,
        "semantic_threshold": 1.0,
        "weight_segformer": ""
    },
    "output": {
        "display": {
            "G_attention_masks": false,
            "aim_port": 53800,
            "aim_server": "http://localhost",
            "diff_fake_real": false,
            "env": "mario2sonic",
            "freq": 200,
            "id": 1,
            "ncols": 0,
            "networks": false,
            "type": [
                "visdom"
            ],
            "visdom_port": 8097,
            "visdom_server": "http://localhost",
            "winsize": 256
        },
        "no_html": false,
        "print_freq": 200,
        "update_html_freq": 1000,
        "verbose": false
    },
    "model": {
        "init_gain": 0.02,
        "init_type": "normal",
        "input_nc": 3,
        "multimodal": false,
        "output_nc": 3
    },
    "train": {
        "sem": {
            "cls_B": false,
            "cls_lambda": 1.0,
            "cls_pretrained": false,
            "cls_template": "basic",
            "idt": false,
            "lr_cls": 0.0002,
            "lr_f_s": 0.0002,
            "mask_lambda": 1.0,
            "net_output": false,
            "use_label_B": true
        },
        "mask": {
            "charbonnier_eps": 1e-06,
            "compute_miou": false,
            "disjoint_f_s": false,
            "f_s_B": true,
            "for_removal": false,
            "lambda_out_mask": 10.0,
            "loss_out_mask": "L1",
            "miou_every": 1000,
            "no_train_f_s_A": false,
            "out_mask": true
        },
        "D_accuracy_every": 1000,
        "D_lr": 0.0001,
        "G_ema": true,
        "G_ema_beta": 0.999,
        "G_lr": 0.0002,
        "batch_size": 2,
        "beta1": 0.9,
        "beta2": 0.999,
        "cls_l1_regression": false,
        "cls_regression": false,
        "compute_D_accuracy": false,
        "compute_metrics": false,
        "compute_metrics_test": false,
        "continue": false,
        "epoch": "latest",
        "epoch_count": 1,
        "export_jit": false,
        "gan_mode": "lsgan",
        "iter_size": 1,
        "load_iter": 0,
        "lr_decay_iters": 50,
        "lr_policy": "linear",
        "metrics_every": 1000,
        "mm_lambda_z": 0.5,
        "mm_nz": 8,
        "n_epochs": 50,
        "n_epochs_decay": 50,
        "nb_img_max_fid": 1000000000,
        "optim": "adam",
        "pool_size": 50,
        "save_by_iter": false,
        "save_epoch_freq": 1,
        "save_latest_freq": 5000,
        "semantic_cls": false,
        "semantic_mask": true
    },
    "dataaug": {
        "APA": false,
        "APA_every": 4,
        "APA_nimg": 50,
        "APA_p": 0,
        "APA_target": 0.6,
        "D_diffusion": false,
        "D_diffusion_every": 4,
        "D_label_smooth": false,
        "D_noise": 0.0,
        "affine": 0.0,
        "affine_scale_max": 1.2,
        "affine_scale_min": 0.8,
        "affine_shear": 45,
        "affine_translate": 0.2,
        "diff_aug_policy": "",
        "diff_aug_proba": 0.5,
        "imgaug": false,
        "no_flip": false,
        "no_rotate": true
    },
    "checkpoints_dir": "/home/shared/EC5/FA12_synthetic_data/tmp_checkpoint_save/",
    "dataroot": "/home/shared/EC5/FA12_synthetic_data/ec5_fa26_work/Oktalse2bdd100k_midi/",
    "ddp_port": "12355",
    "gpu_ids": "0",
    "model_type": "cut",
    "name": "Oktalse2bdd100k_midi",
    "phase": "train",
    "suffix": "",
    "test_batch_size": 1,
    "warning_mode": false,
    "with_amp": false,
    "with_tf32": false,
    "with_torch_compile": false
}

from joligen.

beniz avatar beniz commented on May 30, 2024

Hello, can you provide the output instead ?

TL;DR: make sure your dataset is fine, and use the --data_sanitize_paths option.

If it works with 2, it does work with n gpus. What can happen in multi-gpu and that we are aware of, is that if one of the dataloader fails, i.e. because of a broken dataset (missing image, bbox, ...), it can lock the whole dataloader indefinitely.
In this case, running long enough on a single GPU shall surface the error more easily.
Note that there are two reasons you may not see the error (yet) with 2 GPUs: you'd need to run longer, and the dataloaders shuffle the data, thus randomizing the chances of encountering the error quickly.

The best action at this point is to use --data_sanitize_paths, which checks every single image pair before training. This may take some time, but the sanitized versions of the paths.txt files are stored in the dataset for automatic reuse.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

maybe the problem comes from the multiprocess launched when the code is used

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

Is it required to have the same extension for each dataset (I have png and jpg formats for domains A and B respectively)?

from joligen.

beniz avatar beniz commented on May 30, 2024

It doesn't matter

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

It seems that all the data are 'corrupted'. I use the same format as shown in the example, so I don't understand what the problem is.
Domain A refers to synthetic data and Domain B refers to real data. I obtained this with a dataset I had already used for training, and with which I didn't encounter any problem during training.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

Is "unaligned_labeled_mask_online" the right mode for training using bounding box ?

from joligen.

beniz avatar beniz commented on May 30, 2024

image

You need to update your version to the latest commit, this has been fixed recently. The drawback of living on the edge :)

from joligen.

beniz avatar beniz commented on May 30, 2024

It seems that all the data are 'corrupted'. I use the same format as shown in the example, so I don't understand what the problem is. Domain A refers to synthetic data and Domain B refers to real data. I obtained this with a dataset I had already used for training, and with which I didn't encounter any problem during training.

This may be that JG cannot read the images or bbox correctly, can you list the first few lines of paths.txt ? Or even better share a 'light' version of the dataset with just few images/bboxes in it ?

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

image

from joligen.

beniz avatar beniz commented on May 30, 2024

I also have data stored on another server which is mounted on my training machine, maybe this is the problem.

You need the JG docker to have access to this directory. Easiest way to check "by hand" is to run the docker in interactive mode and call a shell in it.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

I'm not using docker for the training.
Will this problem be solved if i bring the dataset to my training machine ?

from joligen.

beniz avatar beniz commented on May 30, 2024

Will this problem be solved if i bring the dataset to my training machine ?

I don´t know, if this is a system issue, it needs to be solved somehow. I would upgrade to the latest version and check that the files are readable.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

ok, I'll give it a try. thanks

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

from joligen.

beniz avatar beniz commented on May 30, 2024

Your /home/ubuntu/checkpoint_model cannot be accessed. The error is bypassed by JG, I can get it to fail irrevocably. Check your dataset location and checkpoint dir access rights.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

I have this error when i launch the training with one gpu.
seems to be the reason why images in the dataset are said to be wrong.
I encounter this error when i use the last version of joliGEN.
does the data format change from the example in the joliGEN documentation?

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

Oktalse2bdd100k_aube_mini.zip

here is an example of the dataset i use for training. Is there something wrong with the format ?

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

there is a lot of multiprocess launch at the same time when I use the code. Is there a way to reduce their number ?

from joligen.

beniz avatar beniz commented on May 30, 2024

--data_num_threads 1

from joligen.

beniz avatar beniz commented on May 30, 2024

@YoannRandon got the zip file, there are several issues with the dataset, it works once fixed:

  • classes in bbox files must start from 1, see script to fix your full dataset below
import glob
import shutil

# Shift the class id (first space-separated field) of every line in every bbox
# file up by one, so that classes start from 1 instead of 0.
# NOTE: run this only once per dataset -- every run adds 1 to the class ids again.
allfiles = glob.glob('/path/to/Oktalse2bdd100k_aube_mini/*/bbox/*.txt')
print(allfiles)

for bboxfile in allfiles:
    fixedfile = bboxfile + '.fixed'
    # Context managers close both files even if a line fails to parse.
    with open(bboxfile, 'r') as bbf, open(fixedfile, 'w') as bbof:
        for line in bbf:
            if not line.strip():
                # Copy blank lines through unchanged; int() on an empty field
                # would raise ValueError and abort the whole run.
                bbof.write(line)
                continue
            elts = line.split(' ')
            elts[0] = str(int(elts[0]) + 1)
            bbof.write(' '.join(elts))
    # Replace the original only after the fixed copy is completely written.
    shutil.move(fixedfile, bboxfile)
  • f_s_semantic_nclasses in your train_config.json must be set to 3 as in JG we consider class 0 is always the background (implicit) class

  • mask_delta must be set to an array of [w,h] pairs in your train_config.json, i.e. mask_delta_A: [[50,50]] and mask_delta_B: [[15,15]] (not [50] and [15]). We can't yet catch all mistakes in options, though we did improve recently, and this is the one breaking the dataloader.

  • set --train_semantic_mask in command line or add it to your train_config.json to train the semantic mask network, otherwise mask conservation loss cannot be computed and is ignored.

With these fixes, the training run does correctly start on my side.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

Thanks for this information, it works better than before. However, the VM still breaks with "Network error: Software caused connection abort" when I use 4 GPUs, no matter which dataset size is used. I think it may come from the driver version; could you send me the version of the NVIDIA driver you use for training?

image

from joligen.

beniz avatar beniz commented on May 30, 2024

535 is OK, we do use it. I've never heard of the error you are mentioning. Can you detail your setup and show the error trace ? It sounds like a system issue, not JG.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

image

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

After that, my VM can't be accessed for a certain amount of time

image

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

I also have a warning message, saying delta_mask_A/B length should be the same as f_s_nclasses. I have delta_mask_A = [[50,50]] and f_s_nclasses = 3. I tried several changes to match the size but always got this warning. What am I missing?
The training starts without problem when I use this conf, but I would like to know if it is normal.

from joligen.

beniz avatar beniz commented on May 30, 2024

I also have a warning message, saying delta_mask_A/B length should be the same as f_s_nclasses.

mask_delta_{A,B} can specify different values for different classes, such as mask_delta=[[50,40],[30,30],[15,45]] for a 3 classes semantic problem. When a single [w,h] value is passed, it is automatically applied to all classes.

I'm not sure why you are specifying mask_delta in the first place so I can't comment whether it makes sense to have different values per class.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

I noticed that --data_sanitize_paths only works if I use absolute paths in the paths.txt of each trainA/B directory of the dataset.
Maybe you already knew it, but the example in the joligen documentation uses logical paths.

from joligen.

beniz avatar beniz commented on May 30, 2024

Are you using --data_relative_paths as well ?

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

I didn't use them, thanks for the tips. My problem has been solved: it was a problem coming from OVH. I will close this issue.

from joligen.

YoannRandon avatar YoannRandon commented on May 30, 2024

Issue solved, problem coming from ovh instance. Thanks for the several tips you gave me.

from joligen.

Related Issues (20)

Recommend Projects

  • React photo React

    A declarative, efficient, and flexible JavaScript library for building user interfaces.

  • Vue.js photo Vue.js

    🖖 Vue.js is a progressive, incrementally-adoptable JavaScript framework for building UI on the web.

  • Typescript photo Typescript

    TypeScript is a superset of JavaScript that compiles to clean JavaScript output.

  • TensorFlow photo TensorFlow

    An Open Source Machine Learning Framework for Everyone

  • Django photo Django

    The Web framework for perfectionists with deadlines.

  • D3 photo D3

    Bring data to life with SVG, Canvas and HTML. 📊📈🎉

Recommend Topics

  • javascript

    JavaScript (JS) is a lightweight interpreted programming language with first-class functions.

  • web

    Some thing interesting about web. New door for the world.

  • server

    A server is a program made to process requests and deliver data to clients.

  • Machine learning

    Machine learning is a way of modeling and interpreting data that allows a piece of software to respond intelligently.

  • Game

    Some thing interesting about game, make everyone happy.

Recommend Org

  • Facebook photo Facebook

    We are working to build community through open source technology. NB: members must have two-factor auth.

  • Microsoft photo Microsoft

    Open source projects and samples from Microsoft.

  • Google photo Google

    Google ❤️ Open Source for everyone.

  • D3 photo D3

    Data-Driven Documents codes.