Source code for dpgen.collect.collect

#!/usr/bin/env python3

import os,sys,json,glob,argparse,dpdata
import numpy as np
from dpgen.generator.run import data_system_fmt

def collect_data(target_folder, param_file, output, verbose=True, shuffle=True, merge=True):
    """Collect the labeled data of a DP-GEN job into one output directory.

    The initial data sets named in the parameter file and every system
    matching ``iter.NNNNNN/02.fp/data.[0-9]*[0-9]`` under ``target_folder``
    are loaded with dpdata (``deepmd/npy`` format) and written to ``output``
    in the same format.

    Parameters
    ----------
    target_folder : str
        Directory of the DP-GEN job.  Relative paths (the parameter file,
        the initial data and the iteration directories) are resolved
        against it.
    param_file : str
        JSON parameter file, opened after changing into ``target_folder``.
        The keys ``sys_configs``, ``init_data_prefix``, ``init_data_sys``
        and ``model_devi_jobs`` are read; each one is optional.
    output : str
        Directory receiving the data; it is created if it does not exist.
        Initial data go to ``init.<index>`` and collected systems to
        ``sys.<key>`` sub-directories.
    verbose : bool
        Print one line (name, formula, natoms, nframes) per system.
    shuffle : bool
        Call ``shuffle()`` on every collected iteration system before it is
        dumped.  The initial data are never shuffled.
    merge : bool
        If true, iteration systems are grouped by chemical formula;
        otherwise by the suffix of their ``data.*`` directory name.

    Raises
    ------
    OSError
        If ``target_folder`` or ``param_file`` cannot be accessed.
    """
    target_folder = os.path.abspath(target_folder)
    output = os.path.abspath(output)

    # Work from inside the job directory while reading, and always return to
    # the caller's working directory, even if loading fails half-way.
    cwd = os.getcwd()
    os.chdir(target_folder)
    try:
        with open(param_file) as fp:
            jdata = json.load(fp)
        sys_configs = jdata.get('sys_configs', [])

        # init systems
        init_data_prefix = jdata.get('init_data_prefix', '')
        init_data_sys = jdata.get('init_data_sys', [])
        init_data = [
            dpdata.LabeledSystem(os.path.join(init_data_prefix, ii), fmt='deepmd/npy')
            for ii in init_data_sys
        ]

        # collect systems from iter dirs: one iter directory is visited per
        # entry of 'model_devi_jobs'
        coll_data = {}
        numb_jobs = len(jdata.get('model_devi_jobs', {}))
        for iter_idx in range(numb_jobs):
            pattern = os.path.join('iter.%06d' % iter_idx, '02.fp', 'data.[0-9]*[0-9]')
            for data_dir in sorted(glob.glob(pattern)):
                system = dpdata.LabeledSystem(data_dir, fmt='deepmd/npy')
                if merge:
                    sys_str = system.formula
                else:
                    sys_str = os.path.basename(data_dir).split('.')[-1]
                # the first system seen for a key becomes the accumulator,
                # later ones are appended to it
                if sys_str in coll_data:
                    coll_data[sys_str].append(system)
                else:
                    coll_data[sys_str] = system
    finally:
        os.chdir(cwd)

    # print information
    if verbose:
        # default=0 keeps the summary usable when 'sys_configs' is empty
        max_str_len = max((len(str(ii)) for ii in sys_configs), default=0)
        max_form_len = 16
        ptr_fmt = '%%%ds %%%ds natoms %%6d nframes %%6d' % (max_str_len + 5, max_form_len)
        for name, system in zip(init_data_sys, init_data):
            print(ptr_fmt % (str(name),
                             system.formula,
                             system.get_natoms(),
                             system.get_nframes()))
        for key in sorted(coll_data):
            # without merging, the key is the index of the configuration
            sys_str = key if merge else str(sys_configs[int(key)])
            system = coll_data[key]
            print(ptr_fmt % (sys_str,
                             system.formula,
                             system.get_natoms(),
                             system.get_nframes()))

    # shuffle system data
    if shuffle:
        for system in coll_data.values():
            system.shuffle()

    # create output dir
    os.makedirs(output, exist_ok=True)
    # dump init data
    for idx, system in enumerate(init_data):
        out_dir = 'init.' + (data_system_fmt % idx)
        system.to('deepmd/npy', os.path.join(output, out_dir))
    # dump iter data
    for key, system in coll_data.items():
        out_dir = 'sys.%s' % key
        # set_size is the frame count, so all frames are requested in one set
        system.to('deepmd/npy', os.path.join(output, out_dir), set_size=system.get_nframes())
def gen_collect(args):
    """Run :func:`collect_data` with the options held by a parsed namespace.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide the attributes ``JOB_DIR``, ``parameter``, ``OUTPUT``,
        ``verbose``, ``shuffle`` and ``merge``.
    """
    job_dir = args.JOB_DIR
    param_file = args.parameter
    out_dir = args.OUTPUT
    switches = {
        'verbose': args.verbose,
        'shuffle': args.shuffle,
        'merge': args.merge,
    }
    collect_data(job_dir, param_file, out_dir, **switches)
def _main(argv=None):
    """Parse the command line and collect the data of a DP-GEN job.

    Parameters
    ----------
    argv : list of str, optional
        Arguments to parse.  ``None`` (the default) lets argparse read
        them from ``sys.argv``, which is what the script entry point uses.
    """
    parser = argparse.ArgumentParser(description='Collect data from DP-GEN iterations')
    parser.add_argument("JOB_DIR", type=str,
                        help="the directory of the DP-GEN job")
    parser.add_argument("OUTPUT", type=str,
                        help="the output directory of data")
    parser.add_argument('-p', "--parameter", type=str, default='param.json',
                        help="the json file that provides DP-GEN parameters, should be located in JOB_DIR")
    # The three switches are off unless given on the command line.
    parser.add_argument('-v', "--verbose", action='store_true',
                        help="print number of data in each system")
    parser.add_argument('-m', "--merge", action='store_true',
                        help="merge the systems with the same chemical formula")
    parser.add_argument('-s', "--shuffle", action='store_true',
                        help="shuffle the data systems")
    args = parser.parse_args(argv)
    gen_collect(args)


if __name__ == '__main__':
    _main()