Machine Learned Interatomic Potentials (MLIP) with ESPResSo¶

Part 2: Focus on Machine Learned Interatomic Potentials¶

Overview¶

Machine-learned interatomic potentials represent a breakthrough in computational chemistry, bridging the gap between quantum mechanical accuracy and classical force field efficiency. Unlike traditional force fields that require explicit bonding topologies and parameter sets, MLIPs learn interactions directly from training data, making them particularly powerful for studying complex chemical systems.

Training an MLIP can take from a few minutes to weeks on multiple GPUs, depending on the complexity of the system and the desired accuracy. For this tutorial, the models you are using are pre-trained to allow you to focus on understanding their application and performance characteristics. The training procedure is included in this repository and you could in principle add new model configurations. The training workflow has been set up using the IPSuite package [3]. The training data is taken from the work of Cheng et al. [4], which provides high-quality ab initio reference data for water systems.

Note on Units and ASE¶

In this tutorial you will be using the ASE (Atomic Simulation Environment) package [5]. Most MLIP packages provide an ASE calculator interface for their direct use within Python. ASE provides a unified interface for working with atomic structures and computational chemistry codes. It represents atomic systems as Python objects and provides calculator interfaces that allow different simulation engines to compute energies and forces using the same API.

The ASE calculator interface and most MLIP packages use eV and Ångström as units for energy and distance, respectively, as stated in the ASE documentation. This is important to keep in mind when interfacing with other simulation codes that may use different unit systems. All units in this part of the tutorial follow the ASE convention.

Python Libraries¶

In [ ]:
import matplotlib.pyplot as plt
import numpy as np
import sys
import pint
import znh5md
import zntrack
import logging
import warnings
import datetime
import rdkit2ase
import IPython.display
import tqdm.auto as tqdm

# enable import statements on local Python scripts
sys.path.insert(0, ".")

# Initialize unit registry for proper unit handling
ureg = pint.UnitRegistry()

Training Data¶

This shell command downloads the training data set via DVC:

In [ ]:
!dvc pull

APAX Configuration¶

Below is the default configuration for the APAX MLIP training that you'll be using [1]. This YAML file defines key hyperparameters that control both the accuracy and computational efficiency of the model. The hyperparameters define the architecture of the model and how it is trained. Hyperparameters are fixed before training, while the model parameters, e.g. weights and biases, are learned during training.

The target property an MLIP predicts is the potential energy of the entire system $E_\text{system}$ based on the atomic positions and species $S$. To allow for a linear scaling with system size, the total energy is decomposed into local atomic contributions $E_i$. The forces are obtained from the gradients of the energy with respect to atomic positions.

$$ E_\text{system}(S, \theta) = \sum_{i=1}^{N} E_i(\mathbf{G}_i, \theta) $$

Here $\theta$ denotes the model parameters and architecture. The local atomic environment is encoded using an atom-centered descriptor $\mathbf{G}_i$. These descriptors are a fixed-length vector representation, encoding symmetries of the system such as translation, rotation, and permutation of atoms.
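
As a toy illustration of this decomposition (with a hypothetical Lennard-Jones-like stand-in for the learned local energy $E_i$, not the actual GMNN model), the total energy is a sum of per-atom terms, and the forces follow as $\mathbf{F}_i = -\nabla_{\mathbf{r}_i} E_\text{system}$, here approximated by finite differences:

```python
import numpy as np

# Toy stand-in for the learned local energy E_i: each atom contributes a
# term based on its distances to all other atoms (pairs are intentionally
# counted from both sides, since each atom "owns" its own environment).
def total_energy(positions):
    E = 0.0
    for i, r_i in enumerate(positions):
        dists = np.linalg.norm(positions - r_i, axis=1)
        dists = dists[dists > 0]  # exclude self-distance
        E += np.sum((1.0 / dists) ** 12 - (1.0 / dists) ** 6)
    return E

# Forces as the negative gradient of the total energy, approximated by
# central finite differences (an MLIP uses automatic differentiation)
def forces(positions, eps=1e-5):
    F = np.zeros_like(positions)
    for i in range(positions.shape[0]):
        for k in range(3):
            shifted = positions.copy()
            shifted[i, k] += eps
            e_plus = total_energy(shifted)
            shifted[i, k] -= 2 * eps
            e_minus = total_energy(shifted)
            F[i, k] = -(e_plus - e_minus) / (2 * eps)
    return F

pos = np.array([[0.0, 0.0, 0.0], [1.2, 0.0, 0.0], [0.0, 1.3, 0.0]])
print(f"E_system = {total_energy(pos):.4f}")
print(f"Net force:\n{forces(pos).sum(axis=0)}")
```

Because the total energy depends only on relative positions, the forces sum to (numerically) zero; real MLIPs obtain them analytically as gradients of the network output.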

In [ ]:
# Display the APAX configuration file
with open("config/apax.yaml", "r") as f:
    content = f.read()
IPython.display.display(IPython.display.Markdown(f"```yaml\n{content}\n```"))

Model Hyperparameter Study¶

To understand the impact of different hyperparameters on model performance, we have trained multiple models with varying configurations. Each parameter affects both the accuracy and computational cost of the model. Here are the key parameters we'll explore:

Key Hyperparameters¶

  • r_max: The maximum cutoff radius for the descriptor. This defines how far the model "sees" around each atom when predicting energy contributions and forces. A small value makes the model fast but can lead to physical inaccuracies or instabilities. A large value can improve accuracy but increases computational cost.

  • n_basis: The number of radial basis functions used in the descriptor. This controls the resolution of the radial description of atomic environments. More basis functions provide finer detail but increase computational overhead.

  • nn: The architecture of the neural network (number and size of hidden layers).

  • n_radial: The number of radial functions, each formed as a linear combination of the basis functions. Together with n_basis, this sets the resolution of the radial description.

Training Workflow¶

The workflow file workflow.py implements the following steps:

  1. Load training and validation data from the data/ folder
  2. Iterate over different model parameter configurations
  3. Generate configuration files for APAX training
  4. Configure model training and evaluation pipelines
  5. Execute the complete training workflow

All trained models and their results are stored in the nodes/ directory for analysis and comparison.

Available Pre-trained Models¶

You can select from the following pre-trained models to explore how different hyperparameters affect performance:

Neural Network Architecture Variations¶

The feed-forward neural networks' width and depth define the number of parameters and the model's representational capacity:

  • nn_16-16_Apax: Small network (16 neurons per layer)
  • nn_32-32_Apax: Medium network (32 neurons per layer)
  • nn_64-64_Apax: Large network (64 neurons per layer)
  • nn_128-128_Apax: Largest network (128 neurons per layer)

Cutoff Radius Variations¶

The maximum radial cutoff defines the spatial extent of local atomic environments:

  • r_max_2_Apax: Very short cutoff (2.0 Å)
  • r_max_3_Apax: Short cutoff (3.0 Å)
  • r_max_4_Apax: Medium cutoff (4.0 Å)
  • r_max_5_Apax: Standard cutoff (5.0 Å)
  • r_max_5_5_Apax: Extended cutoff (5.5 Å)
  • r_max_6_Apax: Long cutoff (6.0 Å)

Basis Function Variations¶

The GMNN model [2] builds a smooth neighborhood density from a basis set (e.g., equidistant Gaussians or Bessel-like functions), whose members are linearly combined to form the radial basis functions:

  • n_basis_4_Apax: Low resolution (4 basis functions)
  • n_basis_8_Apax: Medium resolution (8 basis functions)
  • n_basis_16_Apax: High resolution (16 basis functions)

Radial Resolution Variations¶

The number of radial functions sets the resolution with which the GMNN descriptor resolves interatomic distances. These radial functions also enter the tensor contractions that encode angular information, so a higher radial resolution allows the model to capture more complex spatial arrangements.

  • n_radial_5_Apax: Medium radial resolution (5)
  • n_radial_6_Apax: Higher radial resolution (6)
  • n_radial_7_Apax: Highest radial resolution (7)

Loading and Using Pre-trained Models¶

Let's load one of the models (r_max_5_5_Apax) and evaluate its performance.

Our APAX models implement the ASE calculator interface, which enables seamless integration with ESPResSo and other simulation packages.

To load the model, we use the ZnTrack package [6], which provides version control and reproducibility for computational workflows. ZnTrack is a general-purpose workflow and data management tool. It is built around the concept of nodes, defined as Python class objects, with each node representing a single task within a workflow. One key advantage of ZnTrack is that nodes can easily be reused and packaged for sharing with others. ZnTrack also serves as the foundation for other packages, such as IPSuite, and is used within APAX; both of these were used to set up the training workflow.

In [ ]:
import workflow

with warnings.catch_warnings(action="ignore"):
    logging.getLogger("zntrack.project").setLevel(logging.WARNING)
    project = workflow.Workflow()
    project.configure()
    project.train()
    logging.getLogger("zntrack.project").setLevel(logging.NOTSET)
In [ ]:
# Load a pre-trained model and get its ASE calculator interface
with warnings.catch_warnings(action="ignore"):
    model = zntrack.from_rev("r_max_5_5_Apax")
    calc = model.get_calculator()
print(calc)

Model Evaluation on Test Dataset¶

To assess the performance of our MLIPs, we evaluate them on a dedicated test dataset. These are atomic configurations that the model has never seen during training, providing an unbiased assessment of the model's generalization ability.

Important considerations for MLIP evaluation:

  • Static test performance (what we measure here) provides a baseline for model quality
  • The test dataset's quality and diversity directly impact the reliability of these metrics
  • Real-world performance in molecular dynamics simulations may differ from static test metrics
  • Long-timescale stability and transferability often require additional validation beyond static tests
In [ ]:
# Load the test dataset (stored in H5MD format)
test_io = znh5md.IO("data/cosmo_water_test.h5")
test_frames = test_io[:]

The test_frames variable contains a list of ASE Atoms objects, each representing a different atomic configuration from the test set. Each Atoms object contains:

  • Atomic positions and species
  • Periodic boundary conditions and unit cell
  • Reference energies and forces from DFT calculations (stored as calculator results)

Later in this tutorial, you'll learn how to create your own atomic configurations for molecular dynamics simulations. For now, familiarize yourself with the dataset by

  • printing the length of the dataset, and number of atoms per configuration
  • plotting the energy and force distributions.
In [ ]:
# SOLUTION CELL
print(f"Number of test frames: {len(test_frames)}")
print(f"Average number of atoms per frame: {np.mean([len(frame) for frame in test_frames]):.1f}")

fig, ax = plt.subplots(1, 2, figsize=(14, 4))
ax[0].hist([frame.get_potential_energy() / 1000. for frame in test_frames], bins=20)
ax[0].set_xlabel("Potential energy / keV")
ax[0].set_ylabel("Counts")

ax[1].hist(
    np.reshape([frame.get_forces() for frame in test_frames], -1),
    bins=200,
    density=True,
)
ax[1].set_xlabel("Force component / eV/Å")
ax[1].set_ylabel("Density")
plt.show()

Now we can use the calc object to apply our MLIP and compute energies and forces for each configuration in the test set. We'll compare these predictions against the reference DFT values to assess model accuracy.

In [ ]:
# Extract reference (true) values from the test dataset
true_energies = [x.get_potential_energy() for x in test_frames]
true_forces = [x.get_forces() for x in test_frames]

# Compute MLIP predictions for all test configurations
pred_energies = []
pred_forces = []
for frame in tqdm.tqdm(test_frames, desc="Computing MLIP predictions"):
    frame.calc = calc  # Assign the MLIP calculator
    pred_energies.append(frame.get_potential_energy())
    pred_forces.append(frame.get_forces())

Exercise 1: Model Performance Analysis¶

Tasks:

  1. Create correlation plots (pred_* vs. true_*) comparing DFT reference values to MLIP predictions for both energies and forces
  2. Calculate the Mean Absolute Error (MAE) for energies and forces
  3. Note that energy MAE is typically reported per atom to enable comparison across different system sizes

Learning objectives:

  • Understand how to quantify MLIP accuracy using standard metrics
  • Learn to visualize model performance through correlation plots
  • Recognize the importance of normalizing energies by system size
In [ ]:
# SOLUTION CELL
fig, ax = plt.subplots(1, 2, figsize=(12, 5))

# Helper function to compute Mean Absolute Error
def mae(y_true, y_pred):
    return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))

# Energy correlation plot (normalized per atom)
atoms = np.array([len(frame) for frame in test_frames])
true_energies_norm = np.array(true_energies) / atoms
pred_energies_norm = np.array(pred_energies) / atoms
e_mae = mae(true_energies_norm, pred_energies_norm)

ax[0].scatter(
    true_energies_norm,
    pred_energies_norm,
    marker="x",
    alpha=0.7,
    label=f"MAE: {e_mae:.4f} eV/atom",
)
ax[0].plot(
    [true_energies_norm.min(), true_energies_norm.max()],
    [true_energies_norm.min(), true_energies_norm.max()],
    "k--",
    alpha=0.5,
)
ax[0].set_xlabel("True Energies / eV/atom")
ax[0].set_ylabel("Predicted Energies / eV/atom")
ax[0].set_title("Energy Predictions")
ax[0].legend()

# Force correlation plot
true_forces_flat = np.concatenate([f.flatten() for f in true_forces])
pred_forces_flat = np.concatenate([f.flatten() for f in pred_forces])
f_mae = mae(true_forces_flat, pred_forces_flat)

ax[1].scatter(
    true_forces_flat,
    pred_forces_flat,
    marker="x",
    alpha=0.3,
    label=f"MAE: {f_mae:.4f} eV/Å",
)
ax[1].plot(
    [true_forces_flat.min(), true_forces_flat.max()],
    [true_forces_flat.min(), true_forces_flat.max()],
    "k--",
    alpha=0.5,
)
ax[1].set_xlabel(r"True Forces / eV/Å")
ax[1].set_ylabel(r"Predicted Forces / eV/Å")
ax[1].set_title("Force Predictions")
ax[1].legend()

plt.tight_layout()
plt.show()

Exercise 2: Comparative Model Analysis¶

Now that you've analyzed the performance of the r_max_5_5_Apax model, compute energy and force MAEs for all available models.

Questions to consider:

  • Which hyperparameters have the strongest impact on accuracy?
  • Is there a trade-off between accuracy and computational speed?
In [ ]:
# SOLUTION CELL
model_names = [
    "nn_16-16_Apax",
    "nn_32-32_Apax",
    "nn_64-64_Apax",
    "nn_128-128_Apax",  # NN variations
    "r_max_2_Apax",
    "r_max_3_Apax",
    "r_max_4_Apax",
    "r_max_5_Apax",
    "r_max_5_5_Apax",
    "r_max_6_Apax",  # Cutoff variations
    "n_basis_4_Apax",
    "n_basis_8_Apax",
    "n_basis_16_Apax",  # Basis function variations
    "n_radial_5_Apax",
    "n_radial_6_Apax",
    "n_radial_7_Apax",  # Radial resolution variations
]

def mae(y_true, y_pred):
    """Calculate Mean Absolute Error"""
    return np.mean(np.abs(np.array(y_true) - np.array(y_pred)))

# Reference quantities (recomputed here so this cell runs standalone)
atoms = np.array([len(frame) for frame in test_frames])
true_energies_norm = np.array(true_energies) / atoms
true_forces_flat = np.concatenate([f.flatten() for f in true_forces])
results = {}

# Progress bar for model evaluation
pbar = tqdm.tqdm(model_names, desc="Evaluating models")

for name in pbar:
    pbar.set_description(f"Evaluating {name}")

    # Load model and get calculator
    with warnings.catch_warnings(action="ignore"):
        model = zntrack.from_rev(name)
        calc = model.get_calculator()

    # Measure evaluation time
    start = datetime.datetime.now()
    pred_frames = calc.batch_eval(test_frames, batch_size=1, silent=True)
    end = datetime.datetime.now()

    # Extract predictions and compute metrics
    pred_energies = [x.get_potential_energy() for x in pred_frames]
    pred_forces = [x.get_forces() for x in pred_frames]

    # Calculate normalized energy MAE and force MAE
    pred_energies_norm = np.array(pred_energies) / atoms
    e_mae = mae(true_energies_norm, pred_energies_norm)

    pred_forces_flat = np.concatenate([f.flatten() for f in pred_forces])
    f_mae = mae(true_forces_flat, pred_forces_flat)

    # Store results
    results[name] = (e_mae, f_mae, end - start)

# Display results in a formatted table
print("Model Performance Comparison")
print("=" * 70)
print(f"{'Model':<20} {'Energy MAE':<15} {'Force MAE':<15} {'Time / s':<10}")
print(f"{'':20} {'/ eV/atom':<15} {'/ eV/Å':<15} {'':10}")
print("-" * 70)

for name, (e_mae, f_mae, duration) in results.items():
    print(f"{name:<20} {e_mae:<15.4f} {f_mae:<15.4f} {duration.total_seconds():<10.2f}")

print("\nKey Observations:")
print("- Lower MAE values indicate better accuracy")
print("- Time reflects computational cost per evaluation")
print("- Best models balance accuracy and efficiency")

System Setup for Molecular Dynamics¶

MLIPs differ fundamentally from classical force fields in their setup requirements. Classical force fields require explicit molecular topologies to assign parameters for inter- and intramolecular interactions. MLIPs, however, learn these interactions directly from training data and only require:

  • Atomic species (element types)
  • Atomic positions
  • Unit cell vectors (for periodic systems)

While this simplicity is powerful, creating atomic coordinates manually is impractical for most systems. Therefore, we use higher-level representations and automated tools:

SMILES Representation¶

We use SMILES (Simplified Molecular-Input Line-Entry System) [7] strings to describe molecular structures. For water, the SMILES string is simply O (representing the oxygen atom with implicit hydrogens). The rdkit2ase package provides convenient functions to convert SMILES strings into 3D atomic coordinates.

In [ ]:
# Generate water molecule conformations from SMILES
water = rdkit2ase.smiles2conformers("O", numConfs=100)

print(f"Generated water molecule as ASE Atoms object: {water[0]}")
print(f"Chemical species: {water[0].get_chemical_symbols()}")
print(f"Atomic positions / Å:\n{water[0].get_positions()}")

# Visualize the molecular structure using RDKit
rdkit2ase.ase2rdkit(water[0])

Creating Bulk Systems with PACKMOL¶

To create a realistic liquid water system for MD simulation, we need to pack multiple water molecules into a periodic box at the appropriate density. We use the PACKMOL software package for this purpose, which efficiently packs molecules while avoiding unrealistic overlaps.

In [ ]:
# Create a liquid water box using PACKMOL
# Parameters: 100 water molecules at liquid density (997 kg/m³)
box = rdkit2ase.pack(
    [water],  # List of molecules to pack
    counts=[100],  # Number of each molecule type
    density=997,  # Density in kg/m³
    packmol="packmol",  # Path to PACKMOL executable, here assumed to be in PATH
)

For our MLIP, this system is represented as:

  • 300 atoms (100 × 3 atoms per water molecule)
  • XYZ coordinates in 3D space
  • Periodic boundary conditions defined by cell vectors
  • Atomic species labels (O and H)

No explicit bonds, angles, or other topological information is needed – the MLIP learned these interactions from its training data.

In [ ]:
# Assign the MLIP calculator to our water box
box.calc = calc
In [ ]:
# Compute the potential energy of the entire system
energy = box.get_potential_energy() * ureg.eV
print(f"Potential energy of the water box: {energy:.2f~P}")
In [ ]:
# Compute forces on all atoms
forces = box.get_forces() * ureg.eV / ureg.angstrom
print(f"Forces on atoms (first 5 shown): {forces[:5]:.2f~P}")

Uncertainty Quantification with Ensemble Methods¶

One feature of the APAX MLIP framework is its ability to estimate the uncertainty of its predictions using ensemble methods. Our models use a "shallow ensemble" approach where the final layers of the neural network are replicated with different random initializations:

ensemble:
    kind: shallow
    n_members: 16

This configuration creates 16 different "members" in the ensemble. The final prediction is the mean across all ensemble members, and the standard deviation provides an estimate of prediction uncertainty.
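
The mean/standard-deviation combination can be sketched with plain NumPy; the 16 per-member energies below are made-up illustrative numbers, not actual APAX output:

```python
import numpy as np

# Hypothetical per-member energy predictions from a 16-member shallow
# ensemble for one configuration (illustrative values only)
rng = np.random.default_rng(42)
member_energies = -467.0 + 0.05 * rng.standard_normal(16)

energy = member_energies.mean()      # reported prediction
uncertainty = member_energies.std()  # spread across members = uncertainty

print(f"E = {energy:.3f} eV ± {uncertainty:.3f} eV")
```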

Applications of uncertainty quantification:

  • Monitor prediction reliability during MD simulations
  • Stop simulations automatically when uncertainty exceeds thresholds
  • Identify regions of configuration space poorly covered by training data
  • Guide active learning strategies for model improvement

This capability is particularly valuable for long MD simulations where the system may explore configurations far from the training distribution.

In [ ]:
# Examine available results from the calculator
print("Available calculator results:")
print(list(box.calc.results.keys()))
In [ ]:
# Extract and display prediction uncertainty
energy_uncertainty = box.calc.results["energy_uncertainty"] * ureg.eV
print(f"Energy prediction: {energy:.1f~P} ± {energy_uncertainty:.1f~P}")

Preparing for ESPResSo Integration¶

With our water system configured and the MLIP calculator assigned, we're ready to integrate with ESPResSo for molecular dynamics simulations. The ASE calculator interface provides a seamless bridge between our APAX models and ESPResSo's simulation engine.

In [ ]:
# Save the water system for use in ESPResSo simulations
from ase.io import write

write("water.xyz", box)
print("Water system saved to water.xyz for ESPResSo integration in Part 3")

Key Learning Outcomes¶

By completing this tutorial, you have learned:

  1. MLIP Fundamentals: How machine-learned interatomic potentials differ from classical force fields and their advantages for complex chemical systems
  2. Model Evaluation: Methods to assess MLIP accuracy using correlation plots and error metrics
  3. Hyperparameter Effects: How different model parameters (cutoff radius, neural network size, basis functions) affect accuracy and computational cost
  4. System Preparation: Creating realistic molecular systems using SMILES representations and PACKMOL
  5. Uncertainty Quantification: Using ensemble methods to estimate prediction reliability
  6. Integration Workflow: Connecting APAX models with simulation engines through ASE calculators

Next Steps¶

This tutorial provides the foundation for using MLIPs in ESPResSo simulations. Consider exploring:

  • Longer MD simulations to study dynamic properties
  • Temperature and pressure effects on model performance
  • Active learning strategies for model improvement
  • Comparison with other MLIP frameworks (MACE, NequIP, etc.)

References¶

[1] Moritz R. Schäfer, Nico Segreto, Fabian Zills, Christian Holm, and Johannes Kästner. Apax: A Flexible and Performant Framework for the Development of Machine-Learned Interatomic Potentials. Journal of Chemical Information and Modeling 2025 65 (15), 8066-8078
[2] V. Zaverkin and Johannes Kästner. Gaussian Moments as Physically Inspired Molecular Descriptors for Accurate and Scalable Machine Learning Potentials. J. Chem. Theory Comput. 2020, 16, 8, 5410–5421
[3] Fabian Zills, Moritz René Schäfer, Nico Segreto, Johannes Kästner, Christian Holm, and Samuel Tovey. Collaboration on Machine-Learned Potentials with IPSuite: A Modular Framework for Learning-on-the-Fly. The Journal of Physical Chemistry B 2024 128 (15), 3662-3676
[4] B. Cheng, E.A. Engel, J. Behler, C. Dellago, & M. Ceriotti. Ab initio thermodynamics of liquid and solid water. Proc. Natl. Acad. Sci. U.S.A. 2019 116 (4) 1110-1115
[5] Ask Hjorth Larsen et al. The atomic simulation environment—a Python library for working with atoms. J. Phys.: Condens. Matter 2017 29 273002
[6] Fabian Zills, Moritz Schäfer, Samuel Tovey, Johannes Kästner, Christian Holm. ZnTrack -- Data as Code. arXiv:2401.10603 [cs.MS] 2024
[7] David Weininger. SMILES, a chemical language and information system. 1. Introduction to methodology and encoding rules. Journal of Chemical Information and Computer Sciences 1988, 28 (1), 31–36