Cleaned up and added check for "Total wall time" in the output

This commit is contained in:
Trung Nguyen
2024-08-13 11:03:35 -05:00
parent bbd72a8960
commit 700a22b7cd

View File

@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
''' '''
UPDATE: July 26, 2024: UPDATE: August 13, 2024:
Launching the LAMMPS binary under testing using a configuration defined in a yaml file (e.g. config.yaml). Launching the LAMMPS binary under testing using a configuration defined in a yaml file (e.g. config.yaml).
Comparing the output thermo with that in the existing log file (with the same nprocs) Comparing the output thermo with that in the existing log file (with the same nprocs)
+ data in the log files are extracted and converted into yaml data structure + data in the log files are extracted and converted into yaml data structure
@ -19,7 +19,7 @@ With the current features, users can:
Limitations: Limitations:
- input scripts use thermo style multi (e.g., examples/peptide) do not work with the expected thermo output format - input scripts use thermo style multi (e.g., examples/peptide) do not work with the expected thermo output format
- input scripts that require partition runs (e.g. examples/neb) need a separate config file, e.g. "args: --partition 3x1" - input scripts that require partition runs (e.g. examples/neb) need a separate config file, e.g. args: "--partition 3x1"
- testing accelerator packages (GPU, INTEL, KOKKOS, OPENMP) need separate config files, "args: -sf omp -pk omp 4" - testing accelerator packages (GPU, INTEL, KOKKOS, OPENMP) need separate config files, "args: -sf omp -pk omp 4"
TODO: TODO:
@ -60,16 +60,15 @@ Example usage:
of run_tests.py simultaneously. of run_tests.py simultaneously.
''' '''
import os from argparse import ArgumentParser
import datetime import datetime
import fnmatch import fnmatch
import logging
import os
import re import re
import subprocess import subprocess
from argparse import ArgumentParser #from multiprocessing import Pool
from multiprocessing import Pool
import logging
# need "pip install numpy pyyaml" # need "pip install numpy pyyaml"
import numpy as np import numpy as np
import yaml import yaml
@ -82,6 +81,9 @@ try:
except ImportError: except ImportError:
from yaml import SafeLoader as Loader from yaml import SafeLoader as Loader
'''
data structure to store the test result
'''
class TestResult: class TestResult:
def __init__(self, name, output=None, time=None, checks=0, status=None): def __init__(self, name, output=None, time=None, checks=0, status=None):
self.name = name self.name = name
@ -105,22 +107,6 @@ class TestResult:
stat : a dictionary that lists the number of passed, skipped, failed tests stat : a dictionary that lists the number of passed, skipped, failed tests
progress_file: yaml file that stores the tested input script and status progress_file: yaml file that stores the tested input script and status
last_progress: the dictionary that shows the status of the last tests last_progress: the dictionary that shows the status of the last tests
NOTE:
To map a function to individual workers:
def func(input1, input2, output_buf):
# do smth
return result
# args is a list of num_workers tuples, each tuple contains the arguments passed to the function executed by a worker
args = []
for i in range(num_workers):
args.append((input1, input2, output_buf))
with Pool(num_workers) as pool:
results = pool.starmap(func, args)
''' '''
def iterate(lmp_binary, input_folder, input_list, config, results, progress_file, last_progress=None, output_buf=None): def iterate(lmp_binary, input_folder, input_list, config, results, progress_file, last_progress=None, output_buf=None):
@ -284,11 +270,13 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
# check if the output contains ERROR # check if the output contains ERROR
if "ERROR" in output: if "ERROR" in output:
cmd_str = "grep ERROR log.lammps" error_line = ""
p = subprocess.run(cmd_str, shell=True, text=True, capture_output=True) for line in output:
error_line = p.stdout.split('\n')[0] if "ERROR" in line:
error_line = line
break
logger.info(f" The run terminated with {input_test} gives the following output:") logger.info(f" The run terminated with {input_test} gives the following output:")
logger.info(f" {error_line}") logger.info(f" {error_line}")
if "Unrecognized" in output: if "Unrecognized" in output:
result.status = "error, unrecognized command, package not installed" result.status = "error, unrecognized command, package not installed"
elif "Unknown" in output: elif "Unknown" in output:
@ -309,6 +297,17 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
continue continue
# if there is no ERROR in the output, then there is something irregular in the run # if there is no ERROR in the output, then there is something irregular in the run
if "Total wall time" not in output:
logger.info(f" ERROR: no Total wall time in the output.\n")
logger.info(f"\n{input_test}:")
logger.info(f"\n Output:\n{output}")
logger.info(f"\n Error:\n{error}")
progress.write(f"{input}: {{ folder: {input_folder}, status: \"error, no Total wall time in the output.\" }}\n")
progress.close()
num_error = num_error + 1
test_id = test_id + 1
continue
if "Step" not in output or "Loop" not in output: if "Step" not in output or "Loop" not in output:
logger.info(f" ERROR: no Step nor Loop in the output.\n") logger.info(f" ERROR: no Step nor Loop in the output.\n")
logger.info(f"\n{input_test}:") logger.info(f"\n{input_test}:")
@ -332,11 +331,11 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
test_id = test_id + 1 test_id = test_id + 1
continue continue
else: else:
# save a copy of the log file # save a copy of the log file for further inspection
cmd_str = f"cp log.lammps log.{basename}.{nprocs}" cmd_str = f"cp log.lammps log.{basename}.{nprocs}"
p = subprocess.run(cmd_str, shell=True, text=True, capture_output=True) p = subprocess.run(cmd_str, shell=True, text=True, capture_output=True)
# process thermo output in log.lammps from the run # parse thermo output in log.lammps from the run
thermo = extract_data_to_yaml("log.lammps") thermo = extract_data_to_yaml("log.lammps")
num_runs = len(thermo) num_runs = len(thermo)
@ -365,10 +364,12 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
# At this point, the run completed without trivial errors, proceed with numerical checks # At this point, the run completed without trivial errors, proceed with numerical checks
# check if there is a reference log file for this input # check if there is a reference log file for this input
if logfile_exist: if logfile_exist:
# parse the thermo output in reference log file
thermo_ref = extract_data_to_yaml(thermo_ref_file) thermo_ref = extract_data_to_yaml(thermo_ref_file)
if thermo_ref: if thermo_ref:
num_runs_ref = len(thermo_ref) num_runs_ref = len(thermo_ref)
else: else:
# dictionary is empty
logger.info(f" ERROR: Error parsing the reference log file {thermo_ref_file}.") logger.info(f" ERROR: Error parsing the reference log file {thermo_ref_file}.")
result.status = "skipped numerical checks due to parsing the reference log file" result.status = "skipped numerical checks due to parsing the reference log file"
results.append(result) results.append(result)
@ -381,13 +382,14 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
msg = f" Cannot find the reference log file for {input_test} with the expected format log.[date].{basename}.*.[nprocs]" msg = f" Cannot find the reference log file for {input_test} with the expected format log.[date].{basename}.*.[nprocs]"
logger.info(msg) logger.info(msg)
print(msg) print(msg)
# try to read in the thermo yaml output from the working directory # attempt to read in the thermo yaml output from the working directory (the following section will be deprecated)
thermo_ref_file = 'thermo.' + input + '.yaml' thermo_ref_file = 'thermo.' + input + '.yaml'
file_exist = os.path.isfile(thermo_ref_file) file_exist = os.path.isfile(thermo_ref_file)
if file_exist == True: if file_exist == True:
thermo_ref = extract_thermo(thermo_ref_file) thermo_ref = extract_thermo(thermo_ref_file)
num_runs_ref = len(thermo_ref) num_runs_ref = len(thermo_ref)
else: else:
# execution usually ends up here when the reference log file does not exist
logger.info(f" {thermo_ref_file} also does not exist in the working directory.") logger.info(f" {thermo_ref_file} also does not exist in the working directory.")
result.status = "skipped due to missing the reference log file" result.status = "skipped due to missing the reference log file"
results.append(result) results.append(result)
@ -400,6 +402,7 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
logger.info(f" Comparing thermo output from log.lammps against the reference log file {thermo_ref_file}") logger.info(f" Comparing thermo output from log.lammps against the reference log file {thermo_ref_file}")
# check if the number of runs matches with that in the reference log file # check if the number of runs matches with that in the reference log file
# (a mismatch may occur when the input has been changed but the reference log file has not been updated yet)
if num_runs != num_runs_ref: if num_runs != num_runs_ref:
logger.info(f" ERROR: Number of runs in log.lammps ({num_runs}) is different from that in the reference log ({num_runs_ref})." logger.info(f" ERROR: Number of runs in log.lammps ({num_runs}) is different from that in the reference log ({num_runs_ref})."
" Check README in the folder, possibly due to using mpirun with partitions or parsing the wrong reference log file.") " Check README in the folder, possibly due to using mpirun with partitions or parsing the wrong reference log file.")
@ -411,7 +414,9 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
test_id = test_id + 1 test_id = test_id + 1
continue continue
# check if the number of fields match with that in the reference log file in the first run for early exit # check if the number of fields match with that in the reference log file in the first run
# (a mismatch may occur when the input has been changed but the ref log file has not been updated yet);
# checking only the first run here allows an early exit
num_fields = len(thermo[0]['keywords']) num_fields = len(thermo[0]['keywords'])
num_fields_ref = len(thermo_ref[0]['keywords']) num_fields_ref = len(thermo_ref[0]['keywords'])
if num_fields != num_fields_ref: if num_fields != num_fields_ref:
@ -543,7 +548,7 @@ def iterate(lmp_binary, input_folder, input_list, config, results, progress_file
# check if memleak detects from valgrind run (need to replace "mpirun" -> valgrind --leak-check=yes mpirun") # check if memleak detects from valgrind run (need to replace "mpirun" -> valgrind --leak-check=yes mpirun")
msg = "completed" msg = "completed"
if 'valgrind' in config['mpiexec']: if 'valgrind' in config['mpiexec']:
if "All heap blocks were free" in error: if "All heap blocks were freed" in error:
msg += ", no memory leak" msg += ", no memory leak"
else: else:
msg += ", memory leaks detected" msg += ", memory leaks detected"
@ -705,6 +710,21 @@ def execute(lmp_binary, config, input_file_name, generate_ref_yaml=False):
''' '''
split a list into a list of N sublists split a list into a list of N sublists
NOTE:
To map a function to individual workers with multiprocessing.Pool:
def func(input1, input2, output_buf):
# do smth
return result
# args is a list of num_workers tuples, each tuple contains the arguments passed to the function executed by a worker
args = []
for i in range(num_workers):
args.append((input1, input2, output_buf))
with Pool(num_workers) as pool:
results = pool.starmap(func, args)
''' '''
def divide_into_N(original_list, N): def divide_into_N(original_list, N):
size = np.ceil(len(original_list) / N) size = np.ceil(len(original_list) / N)
@ -1002,7 +1022,6 @@ if __name__ == "__main__":
with Pool(num_workers) as pool: with Pool(num_workers) as pool:
results = pool.starmap(func, args) results = pool.starmap(func, args)
''' '''
for directory in example_subfolders: for directory in example_subfolders: