1. Prepare training/test data
try:
import torch
print("successfully imported torch")
print(torch.__version__)
except ImportError:
!! pip install torch torchvision torchaudio --user --index-url https://download.pytorch.org/whl/cpu
print("completed installing torch")successfully imported torch
2.6.0+cu124
try:
import numpy as np
print("successfully imported numpy")
print(np.__version__)
except ImportError:
!! pip install numpy --user --upgrade
print("completed installing numpy")successfully imported numpy
1.26.4
try:
from matplotlib import pyplot as plt
print("successfully imported matplotlib")
except ImportError:
!! pip install matplotlib --user --upgrade
print("completed installing matplotlib")successfully imported matplotlib
!! pip install aenet-gpr --user --upgrade
print("completed installing aenet-gpr")
! pip show aenet-gprcompleted installing aenet-gpr
Name: aenet-gpr
Version: 2.6.5
Summary: Atomistic simulation tools based on Gaussian Processes Regression
Home-page: https://github.com/atomisticnet/aenet-gpr
Author: In Won Yeu
Author-email: iy2185@columbia.edu
License: MPL-2.0
Location: /data/home/iy2185/.local/lib/python3.12/site-packages
Requires: ase
Required-by:
import os, sys, site, glob
sys.path.append(site.USER_SITE)
import aenet_gpr
print(aenet_gpr.__version__)2.6.5
from IPython.display import Image
try:
import ase.io
print("successfully imported ase")
except ImportError:
!! pip install ase --user --upgrade
print("completed installing ASE")
import ase.io
print("successfully imported ase")successfully imported ase
1. Prepare training/test data¶
We are using train/test data of H-H Lennard-Jones potential prepared in aenet-gpr/example/1_H2/
import os.path
! rm -rf 1_H2
! mkdir 1_H2
if os.path.isfile("../example/1_H2/train_set.zip"):
! unzip -oq ../example/1_H2/train_set.zip -d ./1_H2/
print("number of train data:")
! find ./1_H2/train_set/ -type f | wc -l
else:
! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train_set.zip
! unzip -oq train_set.zip -d ./1_H2/
! rm train_set.zip
print("number of train data:")
! find ./1_H2/train_set/ -type f | wc -l--2025-10-19 16:54:22-- https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train_set.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train_set.zip [following]
--2025-10-19 16:54:22-- https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train_set.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4676 (4.6K) [application/zip]
Saving to: ‘train_set.zip’
train_set.zip 100%[===================>] 4.57K --.-KB/s in 0s
2025-10-19 16:54:22 (19.0 MB/s) - ‘train_set.zip’ saved [4676/4676]
number of train data:
7
if os.path.isfile("../example/1_H2/test_set.zip"):
! unzip -oq ../example/1_H2/test_set.zip -d ./1_H2/
print("number of test data:")
! find ./1_H2/test_set/ -type f | wc -l
else:
! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/test_set.zip
! unzip -oq test_set.zip -d ./1_H2/
! rm test_set.zip
print("number of test data:")
! find ./1_H2/test_set/ -type f | wc -l--2025-10-19 16:54:23-- https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/test_set.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/test_set.zip [following]
--2025-10-19 16:54:23-- https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/test_set.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 122489 (120K) [application/zip]
Saving to: ‘test_set.zip’
test_set.zip 100%[===================>] 119.62K --.-KB/s in 0.003s
2025-10-19 16:54:23 (42.2 MB/s) - ‘test_set.zip’ saved [122489/122489]
number of test data:
200
atoms = ase.io.read("./1_H2/test_set/file_0000.xsf")
ase.io.write('H2.png', atoms)
Image("H2.png")
2-1. Train–Test with default Cartesian coordinate fingerprint¶
In addition to the reference data files, following configuration file train.in is all you need to run aenet-gpr.
Most of the contents are set to default parameters, which can also be deleted.
! rm -f train.in
if os.path.isfile("../example/1_H2/train.in"):
! cp ../example/1_H2/train.in .
else:
! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train.in--2025-10-19 16:54:26-- https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train.in
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train.in [following]
--2025-10-19 16:54:26-- https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train.in
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 292 [text/plain]
Saving to: ‘train.in’
train.in 100%[===================>] 292 --.-KB/s in 0s
2025-10-19 16:54:26 (5.52 MB/s) - ‘train.in’ saved [292/292]
! cat train.in# File path
Train_file ./1_H2/train_set/file_*.xsf
Test_file ./1_H2/test_set/file_*.xsf
# File format (default: xsf)
File_format xsf
# Descriptor (default: cartesian coordinates)
Descriptor cart
# Data filter (remove close data, default: True)
Filter False
fit_weight True
fit_scale True
! python -m aenet_gpr train.in======================================================================
aenet-GPR: surrogate GPR for GPR-ANN indirect force training
======================================================================
2025-10-19 16:54:30.
Developed by In Won Yeu
This program performs three main steps:
1. Train: Generates a GPR model using the provided structure, energy, and force data.
2. Test: Uses the generated GPR model to predict values for the test set structures.
3. Augmentation: Performs data augmentation in xsf file format, compatible with aenet-(PyTorch),
supporting a GPR-ANN training in conjunction with aenet-(PyTorch).
Each of these steps is executed once the input file (train.in) contains the keywords:
Train_file [train file path]
Test_file [test file path]
Additional_write [True]
======================================================================
Train
======================================================================
Read reference training data
Time needed for reading data: 0.024267 s
Maximum CPU memory used: 0.408844 GB
Maximum GPU memory used: 0.000000 GB
Energy data size: (7,) # (N_data, )
Force data size: (7, 2, 3) # (N_data, N_atom, 3)
----------------------------------------------------------------------
Candidate scales: tensor([0.2000, 0.4000, 0.8000])
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 35 is not positive-definite).
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 23 is not positive-definite).
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 17 is not positive-definite).
Updated scale: 0.4
Weight parameter is too high (1755.4094971801248).
Fix the weight parameter
Updated weight: 1.0
----------------------------------------------------------------------
Model train
Training parameters
{'num_data': 7, 'calculator': {'kerneltype': 'sqexp', 'scale': tensor(0.4000, dtype=torch.float64), 'weight': tensor(1., dtype=torch.float64), 'noise': tensor(1.0000e-06, dtype=torch.float64), 'noisefactor': tensor(0.5000, dtype=torch.float64), 'prior': tensor(1747.2449, dtype=torch.float64)}, 'fix_ind': None, 'pbc': False, 'species': ['H', 'H'], 'num_atom': 2}
Time needed for training: 0.448779 s
Maximum CPU memory used: 0.426407 GB
Maximum GPU memory used: 0.000000 GB
----------------------------------------------------------------------
======================================================================
Test
======================================================================
----------------------------------------------------------------------
Model evaluation for test set
Time needed for test evaluation: 1.174569 s
Maximum CPU memory used: 0.431431 GB
Maximum GPU memory used: 0.000000 GB
----------------------------------------------------------------------
GPR energy MAE (eV): 0.002753264542017503
GPR force MAE (eV/Ang): 0.10276022081180339
Saving test target to [energy_test_reference.npy] and [force_test_reference.npy]
Saving GPR prediction to [energy_test_gpr.npy], [force_test_gpr.npy], [unc_e_test_gpr.npy], and [unc_f_test_gpr.npy]
2-2. Visualize the results¶
from aenet_gpr.util import ReferenceData
train_xsf_files = glob.glob("./1_H2/train_set/file_*")
train_xsf_files.sort()
train_data = ReferenceData(structure_files=train_xsf_files, file_format='xsf')
train_data.set_data()energy_test_gpr = np.load("./energy_test_gpr.npy")
energy_test_reference = np.load("./energy_test_reference.npy")
force_test_gpr = np.load("./force_test_gpr.npy")
force_test_reference = np.load("./force_test_reference.npy")
unc_e_test_gpr = np.load("./unc_e_test_gpr.npy")
unc_f_test_gpr = np.load("./unc_f_test_gpr.npy")n_test = 200
d_test = np.linspace(0.95, 2.05, n_test) # H-H bond distance
n_train = 7
d_train = np.linspace(1.0, 2.0, n_train) # H-H bond distancefig = plt.figure(figsize=(3.5, 4.5))
ax2, ax3 = fig.subplots(2, 1, height_ratios=[1, 1.5], sharex=False, sharey=False)
font_x = {'size': 16, 'color': 'black'}
font_y = {'size': 16, 'color': 'black'}
font_tick = {'size': 12, 'color': 'black'}
# The second plot
ax2.fill_between(d_test, np.subtract(energy_test_gpr, energy_test_reference), 0, color='crimson', alpha=0.5, edgecolor='black')
ax2.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax2.set_xlim([0.95, 2.05])
x_labels = [round(label, 2) for label in ax2.get_xticks()]
ax2.set_xticks(x_labels)
ax2.set_xlim([0.95, 2.05])
ax2.set_xticklabels(x_labels, fontdict=font_tick)
ax2.set_ylabel("Error (eV)", fontdict=font_y)
ax2.set_ylim([-0.05, 0.05])
y_labels = [round(label, 2) for label in ax2.get_yticks()]
y_labels = [-0.04, -0.02, 0.0, 0.02, 0.04]
ax2.set_yticks(y_labels)
ax2.set_ylim([-0.05, 0.05])
ax2.set_yticklabels(y_labels, fontdict=font_tick)
[x.set_linewidth(1.5) for x in ax2.spines.values()]
ax2.tick_params(bottom=True, top=True, left=True, right=True)
ax2.tick_params(labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax2.tick_params(direction='in', length=8, width=1.5)
# The third plot
ax3.plot(d_test, energy_test_reference, '-', label='Target', color='gray', alpha=0.5, linewidth=10)
ax3.plot(d_test, energy_test_gpr, '-', label='GPR', color='crimson', linewidth=4)
ax3.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax3.plot(d_train, train_data.energy, linestyle='', marker='o', markersize=10, color='black', label='Samples')
ax3.set_xlabel('H-H distance (Ang)', fontdict=font_x)
ax3.set_xlim(0.95, 2.05)
x_labels = [round(label, 2) for label in ax3.get_xticks()]
ax3.set_xticks(x_labels)
ax3.set_xticklabels(x_labels, fontdict=font_tick)
ax3.set_xlim(0.95, 2.05)
ax3.set_ylabel('Energy (eV)', fontdict=font_y)
ax3.set_ylim(-1.4, 2.2)
y_labels = [round(label, 2) for label in ax3.get_yticks()]
# y_labels = [-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]
ax3.set_yticks(y_labels)
ax3.set_yticklabels(y_labels, fontdict=font_tick)
ax3.set_ylim(-1.4, 2.2)
ax3.legend(loc='upper right', fontsize=12, ncol=1, frameon=True)
[x.set_linewidth(1.5) for x in ax3.spines.values()]
ax3.tick_params(bottom=True, top=True, left=True, right=False)
ax3.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)
ax3.tick_params(direction='in', length=8, width=1.5)
fig.tight_layout()
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.show()
fig = plt.figure(figsize=(3.5, 4.5))
ax2, ax3 = fig.subplots(2, 1, height_ratios=[1, 1.5], sharex=False, sharey=False)
font_x = {'size': 16, 'color': 'black'}
font_y = {'size': 16, 'color': 'black'}
font_tick = {'size': 12, 'color': 'black'}
# The second plot
ax2.fill_between(d_test, np.subtract(force_test_gpr[:, 0, 0], force_test_reference[:, 0, 0]), 0, color='crimson', alpha=0.5, edgecolor='black')
ax2.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax2.set_xlim([0.95, 2.05])
x_labels = [round(label, 2) for label in ax2.get_xticks()]
ax2.set_xticks(x_labels)
ax2.set_xlim([0.95, 2.05])
ax2.set_xticklabels(x_labels, fontdict=font_tick)
ax2.set_ylabel("Error (eV/Ang)", fontdict=font_y)
ax2.set_ylim([-1.2, 1.2])
y_labels = [round(label, 2) for label in ax2.get_yticks()]
y_labels = [-1.0, -0.5, 0.0, 0.5, 1.0]
ax2.set_yticks(y_labels)
ax2.set_ylim([-1.2, 1.2])
ax2.set_yticklabels(y_labels, fontdict=font_tick)
[x.set_linewidth(1.5) for x in ax2.spines.values()]
ax2.tick_params(bottom=True, top=True, left=True, right=True)
ax2.tick_params(labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax2.tick_params(direction='in', length=8, width=1.5)
# The third plot
ax3.plot(d_test, force_test_reference[:, 0, 0], '-', label='Target', color='gray', alpha=0.5, linewidth=10)
ax3.plot(d_test, force_test_gpr[:, 0, 0], '-', label='GPR', color='crimson', linewidth=4)
ax3.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax3.plot(d_train, train_data.force[:, 0, 0], linestyle='', marker='o', markersize=10, color='black', label='Samples')
ax3.set_xlabel('H-H distance (Ang)', fontdict=font_x)
ax3.set_xlim(0.95, 2.05)
x_labels = [round(label, 2) for label in ax3.get_xticks()]
ax3.set_xticks(x_labels)
ax3.set_xticklabels(x_labels, fontdict=font_tick)
ax3.set_xlim(0.95, 2.05)
ax3.set_ylabel('Force (eV/Ang)', fontdict=font_y)
ax3.set_ylim(-40, 5)
y_labels = [int(label) for label in ax3.get_yticks()]
# y_labels = [-40, -30, -20, -10, 0]
ax3.set_yticks(y_labels)
ax3.set_yticklabels(y_labels, fontdict=font_tick)
ax3.set_ylim(-40, 5)
# ax3.legend(loc='lower right', fontsize=12, ncol=1, frameon=True)
[x.set_linewidth(1.5) for x in ax3.spines.values()]
ax3.tick_params(bottom=True, top=True, left=True, right=False)
ax3.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)
ax3.tick_params(direction='in', length=8, width=1.5)
fig.tight_layout()
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.show()
3. Uncertainty vs Error¶
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
# --- Energy error vs uncertainty ---
energy_error = np.abs(energy_test_gpr - energy_test_reference)
unc_e = unc_e_test_gpr
r_energy, _ = pearsonr(energy_error, unc_e)
plt.figure(figsize=(6, 5))
plt.scatter(unc_e, energy_error, alpha=0.6, edgecolors='k')
plt.xlabel("Energy uncertainty (eV)")
plt.ylabel("Energy absolute error (eV)")
plt.title(f"Energy Error vs Uncertainty\nPearson r = {r_energy:.3f}")
plt.grid(True)
plt.tight_layout()
plt.show()
# --- Force error vs uncertainty ---
force_error = np.abs(force_test_gpr[10:, 0, 0] - force_test_reference[10:, 0, 0])
unc_f = unc_f_test_gpr[10:, 0, 0]
r_force, _ = pearsonr(force_error, unc_f)
plt.figure(figsize=(6, 5))
plt.scatter(unc_f, force_error, alpha=0.6, edgecolors='k')
plt.xlabel("Force uncertainty (eV/Å)")
plt.ylabel("Force absolute error (eV/Å)")
plt.title(f"Force Error vs Uncertainty\nPearson r = {r_force:.3f}")
plt.grid(True)
plt.tight_layout()
plt.show()