I ran this program on an HPC cluster by submitting a job with bsub.
Train deep network for Deep Analogs v 0.2.11
Argument preview:
{'data': {'analogs': 15,
'dataset_class': 'AnEnDatasetSpatial',
'dataset_margin': nan,
'fcst_stations_index': None,
'fcst_variables': ['t2m', 't2m', 'q2m', 'u10m', 'v10m', 'pwat'],
'fitness_num_negative': 15,
'intermediate_file': 'nono',
'julian_weight': 0.0,
'matching_forecast_station': '*** Required when Dataset class is '
'AnEnDatasetOneToMany ***',
'obs_stations_index': None,
'obs_weights': None,
'positive_index': None,
'preprocess_workers': 72,
'test_complete_sequence': False,
'triplet_sample_method': 'fitness',
'triplet_sample_prob': 1.0},
'io': {'anchor_end': datetime.datetime(2018, 8, 16, 20, 0, tzinfo=datetime.timezone.utc),
'anchor_start': datetime.datetime(2016, 3, 29, 20, 0, tzinfo=datetime.timezone.utc),
'forecast': '/global/home/data/forecast.nc',
'observation': '/global/home/data/observations.nc',
'out': '/global/home/data',
'save_as_pure_python_module': True,
'search_end': datetime.datetime(2018, 8, 20, 20, 0, tzinfo=datetime.timezone.utc),
'search_start': datetime.datetime(2016, 3, 29, 20, 0, tzinfo=datetime.timezone.utc),
'split': datetime.datetime(2018, 5, 15, 20, 0, tzinfo=datetime.timezone.utc)},
'model': {'conv_kernel': [3],
'conv_padding': [1],
'conv_stride': [1],
'dropout': 0.2,
'forecast_grid_file': '*** Required when use_conv_lstm is True ***',
'hidden_layer_types': 'conv_lstm',
'linear_layer_last': True,
'lstm_hidden': 25,
'lstm_layers': 2,
'lstm_output': 30,
'lstm_radius': 1,
'pool_kernel': [2],
'pool_padding': [0],
'pool_stride': [2],
'range_step': 1,
'spatial_mask_height': 5,
'spatial_mask_width': 5,
'use_conv_lstm': False,
'use_naive': False},
'train': {'epochs': 50,
'lr': 0.001,
'lr_decay': 0,
'momentum': 0,
'optimizer': 'Adam',
'scaler_type': 'MinMaxScaler',
'test_batch': 32,
'test_loaders': 4,
'train_batch': 32,
'train_loaders': 4,
'train_margin': 0.9,
'use_amsgrad': False,
'use_cpu': True,
'wdecay': 0.001}}
Reading observations and forecasts ...
***** A dictionary modified for AnEn *****
Dictionary tag: Observations
Observation variables: ['t2m', 'rh2', 'd2m', 'ws10']
- ParameterNames: length 4
- Xs: length 1278
- Ys: length 1278
- Times: length 42543
- Data: shape (4, 1278, 42543)
********** End of the message ***********
0 forecast times containing NaN have been removed. 1163 forecast times left.
***** A dictionary modified for AnEn *****
Dictionary tag: Forecasts
- ParameterNames: length 6
- ParameterCirculars: length 0
- Xs: length 1278
- Ys: length 1278
- Times: length 1163
- FLTs: length 84
- Data: shape (6, 1278, 1163, 84)
********** End of the message ***********
Sorting observations in parallel ...
0%| | 0/846 [05:34<?, ?it/s]
Traceback (most recent call last):
File "/global/home/anaconda3/envs/pytorch/bin/deep_analogs_train", line 33, in <module>
sys.exit(load_entry_point('DeepAnalogs==0.2.11', 'console_scripts', 'deep_analogs_train')())
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/site-packages/DeepAnalogs-0.2.11-py3.9.egg/DeepAnalogs/train.py", line 162, in main
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/site-packages/DeepAnalogs-0.2.11-py3.9.egg/DeepAnalogs/utils.py", line 403, in sort_distance_mc
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/site-packages/tqdm/contrib/concurrent.py", line 130, in process_map
return _executor_map(ProcessPoolExecutor, fn, *iterables, **tqdm_kwargs)
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/site-packages/tqdm/contrib/concurrent.py", line 76, in _executor_map
return list(tqdm_class(ex.map(fn, *iterables, **map_args), **kwargs))
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/site-packages/tqdm/std.py", line 1180, in __iter__
for obj in iterable:
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/concurrent/futures/process.py", line 559, in _chain_from_iterable_of_lists
for element in iterable:
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/concurrent/futures/_base.py", line 608, in result_iterator
yield fs.pop().result()
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/concurrent/futures/_base.py", line 445, in result
return self.__get_result()
File "/global/home/anaconda3/envs/pytorch/lib/python3.9/concurrent/futures/_base.py", line 390, in __get_result
raise self._exception
concurrent.futures.process.BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.