I've created this issue as an anchor for an ongoing troubleshooting session. Any solution(s) to this issue will be documented here.
From @jeff231li:
I'm having an I/O issue. One example I encounter is when a dask-worker has opened a *.dcd file at the start of a simulation. As the simulation progresses, the dask-worker will need to open this file and append the current snapshot of the system to it. However, it is spitting out an error because it is trying to append to an empty file (for some reason the initial header was not written to the file by OpenMM). I tried using os.flush() and f.flush(), but the same problem occurs. Note that this only happens in some systems (43 host-guest systems in total) and occurs randomly. We can discuss this further tomorrow, but I think it is a problem with the way TSCC is set up — either hardware limitations or an OS issue (due to high-volume I/O).
We met today in a live session to troubleshoot. Some details:
- He is using paprika to perform host-guest free energy simulations on the paprika-integration branch.
- Using the `DaskPBSBackend` on TSCC for compute.
- Using the Lustre filesystem or an NFS mount for the full run, for both the `working_directory` and the `storage_directory`.
Seeing issues such as the following.
Running on NFS mount
#!/usr/bin/env python
import json
import os
import shutil
from distributed import Adaptive
from openforcefield.typing.engines import smirnoff
from propertyestimator import unit
from propertyestimator.properties import HostGuestBindingAffinity
from propertyestimator.protocols.paprika import OpenMMPaprikaProtocol
from propertyestimator.backends import QueueWorkerResources, DaskPBSBackend
from propertyestimator.client import PropertyEstimatorClient, PropertyEstimatorOptions
from propertyestimator.datasets.taproom import TaproomDataSet
from propertyestimator.server import PropertyEstimatorServer
from propertyestimator.storage import LocalFileStorage
from propertyestimator.utils import setup_timestamp_logging, get_data_filename
from propertyestimator.utils.serialization import TypedJSONEncoder
from propertyestimator.workflow import WorkflowOptions
class CustomAdaptive(Adaptive):
    """Adaptive scaling policy that waits for the cluster to become
    available before computing any scale up/down recommendation.

    Temporary work-around for
    https://github.com/dask/distributed/issues/3154
    """

    async def recommendations(self, target: int) -> dict:
        """Defer to the base implementation once ``self.cluster`` resolves."""
        await self.cluster
        result = await super().recommendations(target)
        return result
def _get_modified_schema(workflow_options):
    """Return the default paprika host-guest workflow schema, modified so
    that both simulation protocols equilibrate with a 1 fs timestep for
    200,000 steps.

    Parameters
    ----------
    workflow_options : WorkflowOptions
        The options to build the default schema with.

    Returns
    -------
        The modified workflow schema.
    """
    default_schema = HostGuestBindingAffinity.get_default_paprika_simulation_workflow_schema(workflow_options)

    def _slow_equilibration(protocol_id):
        # Rebuild the protocol from its default schema, shrink the
        # equilibration timestep and extend the equilibration length, then
        # write the updated schema back into the workflow. Previously this
        # identical four-line sequence was duplicated for each protocol.
        protocol = OpenMMPaprikaProtocol(protocol_id)
        protocol.schema = default_schema.protocols[protocol.id]
        protocol.equilibration_timestep = 1 * unit.femtosecond
        protocol.number_of_equilibration_steps = 200000
        default_schema.protocols[protocol.id] = protocol.schema

    # The same tweak applies to both the host-guest and host-only legs.
    _slow_equilibration('host_guest_free_energy_$(orientation_replicator)')
    _slow_equilibration('host')

    return default_schema
def main():
    """Estimate host-guest binding affinities for the 'acd' and 'bcd'
    hosts on TSCC via a Dask PBS backend, writing the results to
    results.json in the current directory.
    """
    setup_timestamp_logging()

    # Load in the force field (SMIRNOFF parameters plus TIP3P water).
    force_field = smirnoff.ForceField('smirnoff99Frosst-1.1.0.offxml',
                                      get_data_filename('forcefield/tip3p.offxml'))

    # Load in the data set, retaining only a specific host / guest pair.
    host = ['acd', 'bcd']
    # guest = 'bam'
    data_set = TaproomDataSet()
    data_set.filter_by_host_identifiers(*host)
    # data_set.filter_by_guest_identifiers(guest)

    # Set up the server object which runs the calculations. Both of these
    # paths are relative, so they land on the mounted network filesystem
    # the script is launched from.
    working_directory = 'working_directory'
    storage_directory = 'storage_directory'
    # Remove any existing data.
    if os.path.isdir(working_directory):
        shutil.rmtree(working_directory)

    # One CPU thread + one CUDA GPU per worker, 8 h wallclock per job.
    queue_resources = QueueWorkerResources(number_of_threads=1,
                                           number_of_gpus=1,
                                           preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA,
                                           per_thread_memory_limit=4 * unit.gigabyte,
                                           wallclock_time_limit="08:00:00")

    # Shell commands each PBS job runs before starting its dask-worker.
    setup_script_commands = [
        'source /home/jsetiadi/.bashrc',
        'conda activate propertyestimator',
        f'cd /projects/gilson-kirkwood/jsetiadi/propertyestimator/full-taproom'
    ]

    # Adaptive backend scaling between 1 and 48 PBS-submitted workers;
    # CustomAdaptive works around a dask scaling race (see class above).
    calculation_backend = DaskPBSBackend(minimum_number_of_workers=1,
                                         maximum_number_of_workers=48,
                                         resources_per_worker=queue_resources,
                                         queue_name='gpu-condo',
                                         setup_script_commands=setup_script_commands,
                                         adaptive_interval='1000ms',
                                         resource_line='nodes=1:ppn=2:gpuTitan',
                                         adaptive_class=CustomAdaptive)

    # Set up a backend to cache simulation data in.
    storage_backend = LocalFileStorage(storage_directory)

    # Spin up the server object.
    PropertyEstimatorServer(calculation_backend=calculation_backend,
                            storage_backend=storage_backend,
                            working_directory=working_directory)

    # Request the estimate of the host-guest binding affinity, restricted
    # to direct simulation (no reweighting layers).
    options = PropertyEstimatorOptions()
    options.allowed_calculation_layers = ['SimulationLayer']

    workflow_options = WorkflowOptions(convergence_mode=WorkflowOptions.ConvergenceMode.NoChecks)
    workflow_schema = _get_modified_schema(workflow_options)

    options.workflow_options = {'HostGuestBindingAffinity': {'SimulationLayer': workflow_options}}
    options.workflow_schemas = {'HostGuestBindingAffinity': {'SimulationLayer': workflow_schema}}

    estimator_client = PropertyEstimatorClient()
    request = estimator_client.request_estimate(property_set=data_set,
                                                force_field_source=force_field,
                                                options=options)

    # Wait for the results. NOTE(review): the second argument is presumably
    # a polling interval/timeout in seconds — confirm against the client API.
    results = request.results(True, 3600)

    # Save the result to file (bytes mode + explicit UTF-8 encode).
    with open('results.json', 'wb') as file:
        json_results = json.dumps(results, sort_keys=True, indent=2,
                                  separators=(',', ': '), cls=TypedJSONEncoder)
        file.write(json_results.encode('utf-8'))


if __name__ == "__main__":
    main()
Gives:
18:18:38.913 INFO An exception was raised: working_directory/SimulationLayer/e34f73be-3bc7-42f0-82b1-c74f02147912/135262ac-c38f-4108-8c76-4bae641adb51:host_guest_free_energy_1 - An unhandled exception occurred: ['Traceback (most recent call last):\n', ' File "/projects/gilson-kirkwood/jsetiadi/propertyestimator/paprika_integration/propertyestimator/workflow/workflow.py", line 1264, in _execute_protocol\n output_dictionary = protocol.execute(directory, available_resources)\n', ' File "/projects/gilson-kirkwood/jsetiadi/propertyestimator/paprika_integration/propertyestimator/protocols/paprika.py", line 685, in execute\n error = self._setup(\'\', available_resources)\n', ' File "/projects/gilson-kirkwood/jsetiadi/propertyestimator/paprika_integration/propertyestimator/protocols/paprika.py", line 554, in _setup\n result = self._solvate_windows(directory, available_resources)\n', ' File "/projects/gilson-kirkwood/jsetiadi/propertyestimator/paprika_integration/propertyestimator/protocols/paprika.py", line 254, in _solvate_windows\n reference_structure_path)\n', ' File "/projects/gilson-kirkwood/jsetiadi/propertyestimator/paprika_integration/propertyestimator/protocols/paprika.py", line 754, in _add_dummy_atoms\n self._solvated_system_xml_paths[index])\n', ' File "/projects/gilson-kirkwood/jsetiadi/anaconda3_tscc/envs/propertyestimator/lib/python3.7/site-packages/paprika/setup.py", line 398, in add_dummy_atoms\n reference_structure = pmd.load_file(reference_pdb, structure=True)\n', ' File "/projects/gilson-kirkwood/jsetiadi/anaconda3_tscc/envs/propertyestimator/lib/python3.7/site-packages/parmed/formats/registry.py", line 162, in load_file\n if filename.startswith(\'http://\') or filename.startswith(\'https://\')\\\n', "AttributeError: 'NoneType' object has no attribute 'startswith'\n"]
Running on Lustre filesystem
#!/usr/bin/env python
import json
import os
import shutil
from distributed import Adaptive
from openforcefield.typing.engines import smirnoff
from propertyestimator import unit
from propertyestimator.properties import HostGuestBindingAffinity
from propertyestimator.protocols.paprika import OpenMMPaprikaProtocol
from propertyestimator.backends import QueueWorkerResources, DaskPBSBackend
from propertyestimator.client import PropertyEstimatorClient, PropertyEstimatorOptions
from propertyestimator.datasets.taproom import TaproomDataSet
from propertyestimator.server import PropertyEstimatorServer
from propertyestimator.storage import LocalFileStorage
from propertyestimator.utils import setup_timestamp_logging, get_data_filename
from propertyestimator.utils.serialization import TypedJSONEncoder
from propertyestimator.workflow import WorkflowOptions
import logging
from importlib import reload

# Reload the logging module so the basicConfig() call below takes effect
# even if an earlier import already configured the root logger
# (basicConfig is a no-op once handlers exist). NOTE(review): reload() is
# a blunt workaround that resets module state — confirm it does not
# conflict with setup_timestamp_logging() called later in main().
reload(logging)
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.basicConfig(
    filename='propertyestimator.log',
    format='%(asctime)s %(message)s',
    datefmt='%Y-%m-%d %I:%M:%S %p',
)
class CustomAdaptive(Adaptive):
    """An ``Adaptive`` subclass that awaits the cluster before scaling.

    Temporary work-around for
    https://github.com/dask/distributed/issues/3154
    """

    async def recommendations(self, target: int) -> dict:
        """Wait for ``self.cluster`` to resolve, then delegate to the
        parent class to produce the scale up/down recommendations."""
        await self.cluster
        return await super().recommendations(target)
def _get_modified_schema(workflow_options):
    """Return the default paprika host-guest workflow schema with longer,
    finer equilibration and explicit production / solvation settings for
    both the host-guest and host-only protocols.

    Parameters
    ----------
    workflow_options : WorkflowOptions
        The options to build the default schema with.

    Returns
    -------
        The modified workflow schema.
    """
    default_schema = HostGuestBindingAffinity.get_default_paprika_simulation_workflow_schema(workflow_options)

    host_guest_protocol = OpenMMPaprikaProtocol('host_guest_free_energy_$(orientation_replicator)')
    host_guest_protocol.schema = default_schema.protocols[host_guest_protocol.id]
    host_guest_protocol.equilibration_timestep = 1 * unit.femtosecond
    host_guest_protocol.number_of_equilibration_steps = 200000
    host_guest_protocol.number_of_production_steps = 1000000
    host_guest_protocol.number_of_solvent_molecules = 2210

    host_protocol = OpenMMPaprikaProtocol('host')
    host_protocol.schema = default_schema.protocols[host_protocol.id]
    # BUG FIX: this attribute was misspelled `equilbration_timestep`
    # (missing an 'i'), which presumably just set an unused attribute and
    # left the default equilibration timestep in place for the host leg.
    host_protocol.equilibration_timestep = 1 * unit.femtosecond
    host_protocol.number_of_equilibration_steps = 200000
    host_protocol.number_of_production_steps = 1000000
    host_protocol.number_of_solvent_molecules = 1500

    default_schema.protocols[host_guest_protocol.id] = host_guest_protocol.schema
    default_schema.protocols[host_protocol.id] = host_protocol.schema
    return default_schema
def main():
    """Estimate host-guest binding affinities for the full taproom data
    set (no host/guest filters applied) on TSCC via a Dask PBS backend,
    writing the results to results.json in the current directory.
    """
    setup_timestamp_logging()

    # Load in the force field (SMIRNOFF parameters plus TIP3P water).
    force_field = smirnoff.ForceField('smirnoff99Frosst-1.1.0.offxml',
                                      get_data_filename('forcefield/tip3p.offxml'))

    # Load in the data set, retaining only a specific host / guest pair.
    # host = 'bcd'
    # guest = 'bam'
    data_set = TaproomDataSet()
    # data_set.filter_by_host_identifiers(host)
    # data_set.filter_by_guest_identifiers(guest)

    # Set up the server object which runs the calculations. Both of these
    # paths are relative, so they land on the Lustre scratch filesystem
    # the script is launched from.
    working_directory = 'working_directory'
    storage_directory = 'storage_directory'
    # Remove any existing data.
    if os.path.isdir(working_directory):
        shutil.rmtree(working_directory)

    # One CPU thread + one CUDA GPU per worker, 99 h wallclock per job.
    queue_resources = QueueWorkerResources(number_of_threads=1,
                                           number_of_gpus=1,
                                           preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA,
                                           per_thread_memory_limit=4 * unit.gigabyte,
                                           wallclock_time_limit="99:00:00")

    # Shell commands each PBS job runs before starting its dask-worker.
    setup_script_commands = [
        'source /home/jsetiadi/.bashrc',
        'conda activate pe-paprika',
        f'cd /oasis/tscc/scratch/jsetiadi/full-run',
        'echo "Using GPU no $CUDA_VISIBLE_DEVICES"'
    ]

    # Adaptive backend scaling between 1 and 12 PBS-submitted workers;
    # CustomAdaptive works around a dask scaling race (see class above).
    calculation_backend = DaskPBSBackend(minimum_number_of_workers=1,
                                         maximum_number_of_workers=12,
                                         resources_per_worker=queue_resources,
                                         queue_name='home-mgilson',
                                         setup_script_commands=setup_script_commands,
                                         adaptive_interval='1000ms',
                                         resource_line='nodes=1:ppn=3:gpu980',
                                         adaptive_class=CustomAdaptive)

    # Set up a backend to cache simulation data in.
    storage_backend = LocalFileStorage(storage_directory)

    # Spin up the server object.
    PropertyEstimatorServer(calculation_backend=calculation_backend,
                            storage_backend=storage_backend,
                            working_directory=working_directory)

    # Request the estimate of the host-guest binding affinity, restricted
    # to direct simulation (no reweighting layers).
    options = PropertyEstimatorOptions()
    options.allowed_calculation_layers = ['SimulationLayer']

    workflow_options = WorkflowOptions(convergence_mode=WorkflowOptions.ConvergenceMode.NoChecks)
    workflow_schema = _get_modified_schema(workflow_options)

    options.workflow_options = {'HostGuestBindingAffinity': {'SimulationLayer': workflow_options}}
    options.workflow_schemas = {'HostGuestBindingAffinity': {'SimulationLayer': workflow_schema}}

    estimator_client = PropertyEstimatorClient()
    request = estimator_client.request_estimate(property_set=data_set,
                                                force_field_source=force_field,
                                                options=options)

    # Wait for the results. NOTE(review): the second argument is presumably
    # a polling interval/timeout in seconds — confirm against the client API.
    results = request.results(True, 10800)

    # Save the result to file (bytes mode + explicit UTF-8 encode).
    with open('results.json', 'wb') as file:
        json_results = json.dumps(results, sort_keys=True, indent=2,
                                  separators=(',', ': '), cls=TypedJSONEncoder)
        file.write(json_results.encode('utf-8'))


if __name__ == "__main__":
    main()
Gives:
18:48:08.021 INFO An exception was raised: working_directory/SimulationLayer/9ab84d91-427c-42a5-8a7a-46c2256ca9b3/4d727f5c-7b6f-4f45-bdfa-3c11542978dd:filter_host/4d727f5c-7b6f-4f45-bdfa-3c11542978dd:host - An unhandled exception occurred: ['Traceback (most recent call last):\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/workflow/workflow.py", line 1264, in _execute_protocol\n output_dictionary = protocol.execute(directory, available_resources)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/paprika.py", line 699, in execute\n error = self._setup(\'\', available_resources)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/paprika.py", line 568, in _setup\n result = self._solvate_windows(directory, available_resources)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/paprika.py", line 268, in _solvate_windows\n reference_structure_path)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/paprika.py", line 757, in _add_dummy_atoms\n result = build_solvated_complex_system.execute(window_directory, None)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/forcefield.py", line 371, in execute\n file.write(system_xml.encode(\'utf-8\'))\n', 'OSError: [Errno 5] Input/output error\n']
18:48:08.181 INFO Finished server request 9ab84d91-427c-42a5-8a7a-46c2256ca9b3
18:48:11.303 INFO Finished server request e028c196-0e92-419f-8368-c2c1a981d64c
18:48:13.447 INFO An exception was raised: - acd/release/windows/r002/simulations/npt_production: The simulation failed unexpectedly: ['Traceback (most recent call last):\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/simulation.py", line 732, in _simulate\n self._write_checkpoint_file(current_step, context)\n', ' File "/home/jsetiadi/propertyestimator/propertyestimator/protocols/simulation.py", line 454, in _write_checkpoint_file\n json.dump(checkpoint, file, cls=TypedJSONEncoder)\n', 'OSError: [Errno 5] Input/output error\n']
Possible workarounds
It may make sense in this case to create a scratch directory on each compute node's local storage at `$TMPDIR`, for both the `PropertyEstimatorServer` and the dask-workers (via `setup_script_commands`), then set the `working_directory` to point to that. This may avoid issues with rapid writes/reads on mounted network filesystems. More details in the TSCC docs.
Substitutions in the scripts above like the following may work well:
# Stage all scratch I/O on the compute node's local disk ($TMPDIR)
# instead of the NFS/Lustre mount, to avoid the intermittent I/O errors.
setup_script_commands = [
    'source /home/jsetiadi/.bashrc',
    'conda activate propertyestimator',
    # Each worker creates and moves into its own node-local scratch area.
    'mkdir -p $TMPDIR/jsetiadi/working_directory',
    'cd $TMPDIR/jsetiadi/working_directory'
]

# The server's working directory lives on the launch node's local scratch.
working_directory = os.path.join(os.environ['TMPDIR'], 'working_directory')
# exist_ok=True so a rerun on the same node does not crash: plain
# os.makedirs() raises FileExistsError when the directory already exists.
os.makedirs(working_directory, exist_ok=True)
@jeff231li, can you give the above a shot and let us know here if this addresses the errors you are seeing?