try:
    import torch
    print("successfully imported torch")
    print(torch.__version__)
except ImportError:
    !! pip install torch torchvision torchaudio --user --index-url https://download.pytorch.org/whl/cpu
    print("completed installing torch")

successfully imported torch
2.6.0+cu124

try:
    import numpy as np
    print("successfully imported numpy")
    print(np.__version__)
except ImportError:
    !! pip install numpy --user --upgrade
    print("completed installing numpy")

successfully imported numpy
1.26.4

try:
    from matplotlib import pyplot as plt
    print("successfully imported matplotlib")
except ImportError:
    !! pip install matplotlib --user --upgrade
    print("completed installing matplotlib")

successfully imported matplotlib

!! pip install aenet-gpr --user --upgrade
print("completed installing aenet-gpr")

! pip show aenet-gpr

completed installing aenet-gpr
Name: aenet-gpr
Version: 2.6.5
Summary: Atomistic simulation tools based on Gaussian Processes Regression
Home-page: https://github.com/atomisticnet/aenet-gpr
Author: In Won Yeu
Author-email: iy2185@columbia.edu
License: MPL-2.0
Location: /data/home/iy2185/.local/lib/python3.12/site-packages
Requires: ase
Required-by:

import os, sys, site, glob
sys.path.append(site.USER_SITE)

import aenet_gpr
print(aenet_gpr.__version__)

2.6.5

from IPython.display import Image

try:
    import ase.io
    print("successfully imported ase")
except ImportError:
    !! pip install ase --user --upgrade
    print("completed installing ASE")

    import ase.io
    print("successfully imported ase")

successfully imported ase

1. Prepare training/test data¶

We are using train/test data of H-H Lennard-Jones potential prepared in aenet-gpr/example/1_H2/

import os.path

! rm -rf 1_H2
! mkdir 1_H2

if os.path.isfile("../example/1_H2/train_set.zip"):
    ! unzip -oq ../example/1_H2/train_set.zip -d ./1_H2/
    print("number of train data:")
    ! find ./1_H2/train_set/ -type f | wc -l
else:
    ! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train_set.zip
    ! unzip -oq train_set.zip -d ./1_H2/
    ! rm train_set.zip
    print("number of train data:")
    ! find ./1_H2/train_set/ -type f | wc -l

--2025-10-19 16:54:22--  https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train_set.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train_set.zip [following]
--2025-10-19 16:54:22--  https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train_set.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4676 (4.6K) [application/zip]
Saving to: ‘train_set.zip’

train_set.zip       100%[===================>]   4.57K  --.-KB/s    in 0s      

2025-10-19 16:54:22 (19.0 MB/s) - ‘train_set.zip’ saved [4676/4676]

number of train data:
7

if os.path.isfile("../example/1_H2/test_set.zip"):
    ! unzip -oq ../example/1_H2/test_set.zip -d ./1_H2/
    print("number of test data:")
    ! find ./1_H2/test_set/ -type f | wc -l
else:
    ! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/test_set.zip
    ! unzip -oq test_set.zip -d ./1_H2/
    ! rm test_set.zip
    print("number of test data:")
    ! find ./1_H2/test_set/ -type f | wc -l

--2025-10-19 16:54:23--  https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/test_set.zip
Resolving github.com (github.com)... 140.82.112.4
Connecting to github.com (github.com)|140.82.112.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/test_set.zip [following]
--2025-10-19 16:54:23--  https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/test_set.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 122489 (120K) [application/zip]
Saving to: ‘test_set.zip’

test_set.zip        100%[===================>] 119.62K  --.-KB/s    in 0.003s  

2025-10-19 16:54:23 (42.2 MB/s) - ‘test_set.zip’ saved [122489/122489]

number of test data:
200

atoms = ase.io.read("./1_H2/test_set/file_0000.xsf")
ase.io.write('H2.png', atoms)
Image("H2.png")

2-1. Train–Test with default Cartesian coordinate fingerprint¶

In addition to the reference data files, following configuration file train.in is all you need to run aenet-gpr.

Most of the contents are set to default parameters, which can also be deleted.

! rm -f train.in

if os.path.isfile("../example/1_H2/train.in"):
    ! cp ../example/1_H2/train.in .
else:
    ! wget https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train.in

--2025-10-19 16:54:26--  https://github.com/atomisticnet/aenet-gpr/raw/refs/heads/main/example/1_H2/train.in
Resolving github.com (github.com)... 140.82.113.4
Connecting to github.com (github.com)|140.82.113.4|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train.in [following]
--2025-10-19 16:54:26--  https://raw.githubusercontent.com/atomisticnet/aenet-gpr/refs/heads/main/example/1_H2/train.in
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 292 [text/plain]
Saving to: ‘train.in’

train.in            100%[===================>]     292  --.-KB/s    in 0s      

2025-10-19 16:54:26 (5.52 MB/s) - ‘train.in’ saved [292/292]

! cat train.in

# File path
Train_file ./1_H2/train_set/file_*.xsf
Test_file ./1_H2/test_set/file_*.xsf

# File format (default: xsf)
File_format xsf

# Descriptor (default: cartesian coordinates)
Descriptor cart

# Data filter (remove close data, default: True)
Filter False

fit_weight True
fit_scale True

! python -m aenet_gpr train.in

======================================================================
aenet-GPR: surrogate GPR for GPR-ANN indirect force training
======================================================================

2025-10-19 16:54:30.

Developed by In Won Yeu

This program performs three main steps:
1. Train: Generates a GPR model using the provided structure, energy, and force data.
2. Test: Uses the generated GPR model to predict values for the test set structures.
3. Augmentation: Performs data augmentation in xsf file format, compatible with aenet-(PyTorch),
supporting a GPR-ANN training in conjunction with aenet-(PyTorch).

Each of these steps is executed once the input file (train.in) contains the keywords:
Train_file [train file path]
Test_file [test file path]
Additional_write [True]


======================================================================
Train
======================================================================

Read reference training data

Time needed for reading data:            0.024267 s
Maximum CPU memory used:             0.408844 GB
Maximum GPU memory used:             0.000000 GB
Energy data size: (7,)  # (N_data, )
Force data size: (7, 2, 3)  # (N_data, N_atom, 3)
----------------------------------------------------------------------
Candidate scales: tensor([0.2000, 0.4000, 0.8000])
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 35 is not positive-definite).
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 23 is not positive-definite).
linalg.cholesky: The factorization could not be completed because the input is not positive-definite (the leading minor of order 17 is not positive-definite).
Updated scale: 0.4
Weight parameter is too high (1755.4094971801248).
Fix the weight parameter
Updated weight: 1.0
----------------------------------------------------------------------
Model train

Training parameters
{'num_data': 7, 'calculator': {'kerneltype': 'sqexp', 'scale': tensor(0.4000, dtype=torch.float64), 'weight': tensor(1., dtype=torch.float64), 'noise': tensor(1.0000e-06, dtype=torch.float64), 'noisefactor': tensor(0.5000, dtype=torch.float64), 'prior': tensor(1747.2449, dtype=torch.float64)}, 'fix_ind': None, 'pbc': False, 'species': ['H', 'H'], 'num_atom': 2}

Time needed for training:            0.448779 s
Maximum CPU memory used:             0.426407 GB
Maximum GPU memory used:             0.000000 GB
----------------------------------------------------------------------
======================================================================
Test
======================================================================
----------------------------------------------------------------------
Model evaluation for test set
Time needed for test evaluation:            1.174569 s
Maximum CPU memory used:             0.431431 GB
Maximum GPU memory used:             0.000000 GB
----------------------------------------------------------------------
GPR energy MAE (eV): 0.002753264542017503
GPR force MAE (eV/Ang): 0.10276022081180339

Saving test target to [energy_test_reference.npy] and [force_test_reference.npy]
Saving GPR prediction to [energy_test_gpr.npy], [force_test_gpr.npy], [unc_e_test_gpr.npy], and [unc_f_test_gpr.npy]

2-2. Visualize the results¶

from aenet_gpr.util import ReferenceData

train_xsf_files = glob.glob("./1_H2/train_set/file_*")
train_xsf_files.sort()

train_data = ReferenceData(structure_files=train_xsf_files, file_format='xsf')
train_data.set_data()

energy_test_gpr = np.load("./energy_test_gpr.npy")
energy_test_reference = np.load("./energy_test_reference.npy")

force_test_gpr = np.load("./force_test_gpr.npy")
force_test_reference = np.load("./force_test_reference.npy")

unc_e_test_gpr = np.load("./unc_e_test_gpr.npy")
unc_f_test_gpr = np.load("./unc_f_test_gpr.npy")

n_test = 200
d_test = np.linspace(0.95, 2.05, n_test)  # H-H bond distance

n_train = 7
d_train = np.linspace(1.0, 2.0, n_train)  # H-H bond distance

fig = plt.figure(figsize=(3.5, 4.5))
ax2, ax3 = fig.subplots(2, 1, height_ratios=[1, 1.5], sharex=False, sharey=False)

font_x = {'size': 16, 'color': 'black'}
font_y = {'size': 16, 'color': 'black'}
font_tick = {'size': 12, 'color': 'black'}


# The second plot
ax2.fill_between(d_test, np.subtract(energy_test_gpr, energy_test_reference), 0, color='crimson', alpha=0.5, edgecolor='black')
ax2.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)

ax2.set_xlim([0.95, 2.05])
x_labels = [round(label, 2) for label in ax2.get_xticks()]
ax2.set_xticks(x_labels)
ax2.set_xlim([0.95, 2.05])
ax2.set_xticklabels(x_labels, fontdict=font_tick)

ax2.set_ylabel("Error (eV)", fontdict=font_y)
ax2.set_ylim([-0.05, 0.05])
y_labels = [round(label, 2) for label in ax2.get_yticks()]
y_labels = [-0.04, -0.02, 0.0, 0.02, 0.04]
ax2.set_yticks(y_labels)
ax2.set_ylim([-0.05, 0.05])
ax2.set_yticklabels(y_labels, fontdict=font_tick)

[x.set_linewidth(1.5) for x in ax2.spines.values()]
ax2.tick_params(bottom=True, top=True, left=True, right=True)
ax2.tick_params(labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax2.tick_params(direction='in', length=8, width=1.5)


# The third plot
ax3.plot(d_test, energy_test_reference, '-', label='Target', color='gray', alpha=0.5, linewidth=10)
ax3.plot(d_test, energy_test_gpr, '-', label='GPR', color='crimson', linewidth=4)
ax3.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax3.plot(d_train, train_data.energy, linestyle='', marker='o', markersize=10, color='black', label='Samples')

ax3.set_xlabel('H-H distance (Ang)', fontdict=font_x)
ax3.set_xlim(0.95, 2.05)
x_labels = [round(label, 2) for label in ax3.get_xticks()]
ax3.set_xticks(x_labels)
ax3.set_xticklabels(x_labels, fontdict=font_tick)
ax3.set_xlim(0.95, 2.05)

ax3.set_ylabel('Energy (eV)', fontdict=font_y)
ax3.set_ylim(-1.4, 2.2)
y_labels = [round(label, 2) for label in ax3.get_yticks()]
# y_labels = [-1.0, -0.5, 0.0, 0.5, 1.0, 1.5, 2.0]
ax3.set_yticks(y_labels)
ax3.set_yticklabels(y_labels, fontdict=font_tick)
ax3.set_ylim(-1.4, 2.2)

ax3.legend(loc='upper right', fontsize=12, ncol=1, frameon=True)
[x.set_linewidth(1.5) for x in ax3.spines.values()]
ax3.tick_params(bottom=True, top=True, left=True, right=False)
ax3.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)
ax3.tick_params(direction='in', length=8, width=1.5)

fig.tight_layout()
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.show()

fig = plt.figure(figsize=(3.5, 4.5))
ax2, ax3 = fig.subplots(2, 1, height_ratios=[1, 1.5], sharex=False, sharey=False)

font_x = {'size': 16, 'color': 'black'}
font_y = {'size': 16, 'color': 'black'}
font_tick = {'size': 12, 'color': 'black'}


# The second plot
ax2.fill_between(d_test, np.subtract(force_test_gpr[:, 0, 0], force_test_reference[:, 0, 0]), 0, color='crimson', alpha=0.5, edgecolor='black')
ax2.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)

ax2.set_xlim([0.95, 2.05])
x_labels = [round(label, 2) for label in ax2.get_xticks()]
ax2.set_xticks(x_labels)
ax2.set_xlim([0.95, 2.05])
ax2.set_xticklabels(x_labels, fontdict=font_tick)

ax2.set_ylabel("Error (eV/Ang)", fontdict=font_y)
ax2.set_ylim([-1.2, 1.2])
y_labels = [round(label, 2) for label in ax2.get_yticks()]
y_labels = [-1.0, -0.5, 0.0, 0.5, 1.0]
ax2.set_yticks(y_labels)
ax2.set_ylim([-1.2, 1.2])
ax2.set_yticklabels(y_labels, fontdict=font_tick)

[x.set_linewidth(1.5) for x in ax2.spines.values()]
ax2.tick_params(bottom=True, top=True, left=True, right=True)
ax2.tick_params(labelbottom=False, labeltop=False, labelleft=True, labelright=False)
ax2.tick_params(direction='in', length=8, width=1.5)


# The third plot
ax3.plot(d_test, force_test_reference[:, 0, 0], '-', label='Target', color='gray', alpha=0.5, linewidth=10)
ax3.plot(d_test, force_test_gpr[:, 0, 0], '-', label='GPR', color='crimson', linewidth=4)
ax3.vlines(d_train, ymin=-100, ymax=100, color='black', linestyle='--', linewidth=1)
ax3.plot(d_train, train_data.force[:, 0, 0], linestyle='', marker='o', markersize=10, color='black', label='Samples')

ax3.set_xlabel('H-H distance (Ang)', fontdict=font_x)
ax3.set_xlim(0.95, 2.05)
x_labels = [round(label, 2) for label in ax3.get_xticks()]
ax3.set_xticks(x_labels)
ax3.set_xticklabels(x_labels, fontdict=font_tick)
ax3.set_xlim(0.95, 2.05)

ax3.set_ylabel('Force (eV/Ang)', fontdict=font_y)
ax3.set_ylim(-40, 5)
y_labels = [int(label) for label in ax3.get_yticks()]
# y_labels = [-40, -30, -20, -10, 0]
ax3.set_yticks(y_labels)
ax3.set_yticklabels(y_labels, fontdict=font_tick)
ax3.set_ylim(-40, 5)

# ax3.legend(loc='lower right', fontsize=12, ncol=1, frameon=True)
[x.set_linewidth(1.5) for x in ax3.spines.values()]
ax3.tick_params(bottom=True, top=True, left=True, right=False)
ax3.tick_params(labelbottom=True, labeltop=False, labelleft=True, labelright=False)
ax3.tick_params(direction='in', length=8, width=1.5)

fig.tight_layout()
plt.subplots_adjust(hspace=0.05, wspace=0.05)
plt.show()

3. Uncertainty vs Error¶

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import pearsonr

# --- Energy error vs uncertainty ---
energy_error = np.abs(energy_test_gpr - energy_test_reference)
unc_e = unc_e_test_gpr

r_energy, _ = pearsonr(energy_error, unc_e)

plt.figure(figsize=(6, 5))
plt.scatter(unc_e, energy_error, alpha=0.6, edgecolors='k')
plt.xlabel("Energy uncertainty (eV)")
plt.ylabel("Energy absolute error (eV)")
plt.title(f"Energy Error vs Uncertainty\nPearson r = {r_energy:.3f}")
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Force error vs uncertainty ---
force_error = np.abs(force_test_gpr[10:, 0, 0] - force_test_reference[10:, 0, 0])
unc_f = unc_f_test_gpr[10:, 0, 0]

r_force, _ = pearsonr(force_error, unc_f)

plt.figure(figsize=(6, 5))
plt.scatter(unc_f, force_error, alpha=0.6, edgecolors='k')
plt.xlabel("Force uncertainty (eV/Å)")
plt.ylabel("Force absolute error (eV/Å)")
plt.title(f"Force Error vs Uncertainty\nPearson r = {r_force:.3f}")
plt.grid(True)
plt.tight_layout()
plt.show()