Source code for reforge.io

"""File: io.py

Description:
    This module provides I/O utilities for the reForge workflow, including functions 
    for reading and saving various data formats (e.g., CSV, NPY, XVG) as well as parsing 
    PDB files into domain-specific objects. Additionally, it offers helper functions for 
    filtering file names and recursively retrieving file paths from directories.

Usage Example:
    >>> from reforge.io import read_positions, pdb2system, npy2csv
    >>> import numpy as np
    >>> # Read positions of atom group 'ag' from an MDAnalysis Universe object 'u'
    >>> positions = read_positions(u, ag, b=0, e=500, sample_rate=1)
    >>> # Save a NumPy array to CSV format
    >>> data = np.random.rand(100, 10)
    >>> npy2csv(data, 'output.csv')
    >>> # Parse a PDB file into a System object
    >>> system = pdb2system('structure.pdb')

Requirements:
    - Python 3.x
    - NumPy
    - Pandas
    - pathlib
    - reForge utilities (timeit, memprofit, logger)
    - reForge pdbtools (AtomList, System, PDBParser)

Author: Your Name
Date: YYYY-MM-DD
"""

from io import StringIO
from pathlib import Path

import numpy as np
import pandas as pd

from reforge.utils import timeit, memprofit, logger
from reforge.pdbtools import AtomList, System, PDBParser

################################################################################
## Reading Trajectories with MDAnalysis
################################################################################

@timeit
@memprofit
def read_positions(u, ag, b=0, e=10000000, sample_rate=1, dtype=np.float32):
    """Collect the positions of an atom group over a trajectory.

    The trajectory of ``u`` is walked with a stride of ``sample_rate``. For
    every visited frame whose ``ts.time`` lies strictly between ``b`` and
    ``e``, the flattened ``ag.positions`` array is recorded.

    Parameters
    ----------
    u : MDAnalysis.Universe
        Universe whose ``trajectory`` is iterated.
    ag : MDAnalysis.AtomGroup
        Atom group whose ``positions`` are read at each selected frame.
    b : int, optional
        Lower bound compared against ``ts.time``; the bound itself is
        excluded (default is 0).
    e : int, optional
        Upper bound compared against ``ts.time``; the bound itself is
        excluded (default is 10000000).
    sample_rate : int, optional
        Stride used to slice the trajectory (default is 1, every frame).
    dtype : data-type, optional
        Data type of the returned array (default is np.float32).

    Returns
    -------
    np.ndarray
        C-contiguous array with one column per selected frame and one row
        per flattened coordinate, i.e. shape (n_coords, n_frames).
    """
    logger.info("Reading positions...")
    frames = []
    for ts in u.trajectory[::sample_rate]:
        # ag.positions reflects the frame the trajectory currently points at.
        if b < ts.time < e:
            frames.append(ag.positions.flatten())
    stacked = np.array(frames, dtype=dtype)
    # Transpose so frames become columns, then force a contiguous layout.
    coords = np.ascontiguousarray(stacked.T)
    logger.info("Done!")
    return coords
@timeit
@memprofit
def read_velocities(u, ag, b=0, e=10000000, sample_rate=1, dtype=np.float32):
    """Collect the velocities of an atom group over a trajectory.

    The trajectory of ``u`` is walked with a stride of ``sample_rate``. For
    every visited frame whose ``ts.time`` lies strictly between ``b`` and
    ``e``, the flattened ``ag.velocities`` array is recorded.

    Parameters
    ----------
    u : MDAnalysis.Universe
        Universe whose ``trajectory`` is iterated.
    ag : MDAnalysis.AtomGroup
        Atom group whose ``velocities`` are read at each selected frame.
    b : int, optional
        Lower bound compared against ``ts.time``; the bound itself is
        excluded (default is 0).
    e : int, optional
        Upper bound compared against ``ts.time``; the bound itself is
        excluded (default is 10000000).
    sample_rate : int, optional
        Stride used to slice the trajectory (default is 1, every frame).
    dtype : data-type, optional
        Data type of the returned array (default is np.float32).

    Returns
    -------
    np.ndarray
        C-contiguous array with one column per selected frame and one row
        per flattened velocity component, i.e. shape (n_coords, n_frames).
    """
    logger.info("Reading velocities...")
    frames = []
    for ts in u.trajectory[::sample_rate]:
        # ag.velocities reflects the frame the trajectory currently points at.
        if b < ts.time < e:
            frames.append(ag.velocities.flatten())
    stacked = np.array(frames, dtype=dtype)
    # Transpose so frames become columns, then force a contiguous layout.
    velocities = np.ascontiguousarray(stacked.T)
    logger.info("Done!")
    return velocities
def parse_covar_dat(file, dtype=np.float32):
    """Load a whitespace-separated covar.dat file as a square covariance matrix.

    The file is read as a headerless table. The number of residues is derived
    from the number of table rows as ``int(sqrt(n_rows / 3))`` and all values
    are then rearranged into a square matrix.

    Parameters
    ----------
    file : str
        Path to the covar.dat file.
    dtype : data-type, optional
        Data type of the returned matrix (default is np.float32).

    Returns
    -------
    np.ndarray
        Matrix of shape (3*resnum, 3*resnum).
    """
    table = pd.read_csv(file, sep="\\s+", header=None)
    values = np.asarray(table, dtype=dtype)
    n_rows = values.shape[0]
    resnum = int(np.sqrt(n_rows / 3))
    side = 3 * resnum
    return np.reshape(values, (side, side))
################################################################################ ## File Filtering and Retrieval Functions ################################################################################
def fname_filter(f, sw="", cont="", ew=""):
    """Tell whether a file name has the requested prefix, substring and suffix.

    Parameters
    ----------
    f : str
        File name to test.
    sw : str, optional
        Prefix the name must start with (default is an empty string).
    cont : str, optional
        Substring the name must contain (default is an empty string).
    ew : str, optional
        Suffix the name must end with (default is an empty string).

    Returns
    -------
    bool
        True only when all three conditions hold. Empty patterns always match.
    """
    if not f.startswith(sw):
        return False
    if cont not in f:
        return False
    return f.endswith(ew)
def filter_files(fpaths, sw="", cont="", ew=""):
    """Keep only the paths whose file name passes ``fname_filter``.

    Parameters
    ----------
    fpaths : list[Path]
        Paths to examine; only each path's ``name`` attribute is tested.
    sw : str, optional
        Prefix the name must start with (default is an empty string).
    cont : str, optional
        Substring the name must contain (default is an empty string).
    ew : str, optional
        Suffix the name must end with (default is an empty string).

    Returns
    -------
    list[Path]
        The matching paths, in their original order.
    """

    def name_matches(path):
        return fname_filter(path.name, sw=sw, cont=cont, ew=ew)

    return list(filter(name_matches, fpaths))
def pull_files(directory, pattern):
    """Recursively glob a directory tree and return the matches as strings.

    Parameters
    ----------
    directory : str or Path
        Root directory of the search.
    pattern : str
        Glob pattern handed to ``Path.rglob`` (for example ``"*.txt"``).

    Returns
    -------
    list[str]
        One string per match. Each path is built on ``directory`` as given and
        is not converted to an absolute path. Any entry matching the pattern is
        returned, which can include directories.

    Raises
    ------
    FileNotFoundError
        If ``directory`` does not exist or is not a directory.
    """
    root = Path(directory)
    if not (root.exists() and root.is_dir()):
        raise FileNotFoundError(
            f"Directory '{directory}' does not exist or is not a directory."
        )
    return list(map(str, root.rglob(pattern)))
def pull_all_files(directory):
    """Return every entry found below a directory, searched recursively.

    This is ``pull_files`` called with the catch-all pattern ``"*"``.

    Parameters
    ----------
    directory : str or Path
        Root directory of the search.

    Returns
    -------
    list[str]
        Whatever ``pull_files`` returns for the pattern ``"*"``.
    """
    everything = "*"
    return pull_files(directory, everything)
################################################################################ ## Data Conversion and I/O Functions ################################################################################
def xvg2npy(xvg_path, npy_path, usecols=(0, 1)):
    """Convert a GROMACS XVG file to a NumPy binary file (.npy).

    Lines starting with ``#`` (comments) or ``@`` (plot directives) are
    dropped before parsing, so files carrying the standard XVG header load
    the same way as header-less ones.

    Parameters
    ----------
    xvg_path : str
        Path to the input XVG file.
    npy_path : str
        Path where the output .npy file will be saved.
    usecols : list of int, optional
        Column indices to read from the XVG file (default is (0, 1)).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the XVG file cannot be opened or parsed; the underlying error is
        chained as the cause.
    """
    try:
        with open(xvg_path, encoding="utf-8") as handle:
            # Keep only data rows: '#' and '@' lines are XVG metadata.
            payload = "".join(
                line
                for line in handle
                if not line.lstrip().startswith(("#", "@"))
            )
        df = pd.read_csv(StringIO(payload), sep="\\s+", header=None, usecols=usecols)
    except Exception as exc:
        raise ValueError("Error reading XVG file") from exc
    # One row per selected column; a single column collapses to 1-D.
    data = np.squeeze(df.to_numpy().T)
    np.save(npy_path, data)
def pdb2system(pdb_path) -> System:
    """Build a System from a PDB file.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.

    Returns
    -------
    System
        The result of ``PDBParser(pdb_path).parse()``.
    """
    return PDBParser(pdb_path).parse()
def pdb2atomlist(pdb_path) -> AtomList:
    """Build an AtomList from a PDB file.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.

    Returns
    -------
    AtomList
        A new AtomList on which ``read_pdb(pdb_path)`` has been called.
    """
    atom_list = AtomList()
    atom_list.read_pdb(pdb_path)
    return atom_list
def read_data(fpath, expected_rows=1104, max_xvg_rows=10000):
    """Read data from a file (.csv, .npy, .dat, or .xvg) as a NumPy array.

    The format is chosen from the file extension:

    * ``.npy`` files are loaded with ``np.load``.
    * ``.csv`` / ``.dat`` files are parsed as headerless, whitespace-separated
      tables and squeezed.
    * ``.xvg`` files are parsed the same way after dropping ``#`` and ``@``
      header lines; only the second column is kept.

    Parameters
    ----------
    fpath : str
        Path to the data file.
    expected_rows : int or None, optional
        Required size of the first axis for csv/dat data. The default (1104)
        preserves the previously hard-coded requirement; pass ``None`` to
        accept any size.
    max_xvg_rows : int or None, optional
        Largest allowed size of the first axis for xvg data. The default
        (10000) preserves the previously hard-coded limit; pass ``None`` to
        disable it.

    Returns
    -------
    np.ndarray
        The data loaded from the file.

    Raises
    ------
    ValueError
        If the extension is unsupported, the file cannot be read, or a size
        check fails. Read and size-check failures are reported with a generic
        per-format message and the specific error chained as the cause.
    """
    ftype = Path(fpath).suffix[1:]
    if ftype == "npy":
        try:
            data = np.load(fpath)
        except Exception as exc:
            raise ValueError("Error loading npy file") from exc
    elif ftype in {"csv", "dat"}:
        try:
            df = pd.read_csv(fpath, sep="\\s+", header=None)
            data = np.squeeze(df.values)
            if expected_rows is not None and data.shape[0] != expected_rows:
                raise ValueError("Data shape mismatch for csv/dat file")
        except Exception as exc:
            raise ValueError("Error reading csv/dat file") from exc
    elif ftype == "xvg":
        try:
            with open(fpath, encoding="utf-8") as handle:
                # Keep only data rows: '#' and '@' lines are XVG metadata.
                payload = "".join(
                    line
                    for line in handle
                    if not line.lstrip().startswith(("#", "@"))
                )
            df = pd.read_csv(StringIO(payload), sep="\\s+", header=None, usecols=[1])
            data = np.squeeze(df.values)
            if max_xvg_rows is not None and data.shape[0] > max_xvg_rows:
                raise ValueError("Data shape too large for xvg file")
        except Exception as exc:
            raise ValueError("Error reading xvg file") from exc
    else:
        raise ValueError("Unsupported file type")
    return data
def read_xvg(fpath, usecols=(0, 1)):
    """Read a GROMACS XVG file and return its contents as a Pandas DataFrame.

    Lines starting with ``#`` (comments) or ``@`` (plot directives) are
    dropped before parsing, so files carrying the standard XVG header load
    the same way as header-less ones.

    Parameters
    ----------
    fpath : str
        Path to the XVG file.
    usecols : list of int, optional
        Column indices to read from the file (default is (0, 1)).

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the selected columns from the XVG file.

    Raises
    ------
    ValueError
        If the file cannot be opened or parsed; the underlying error is
        chained as the cause.
    """
    try:
        with open(fpath, encoding="utf-8") as handle:
            # Keep only data rows: '#' and '@' lines are XVG metadata.
            payload = "".join(
                line
                for line in handle
                if not line.lstrip().startswith(("#", "@"))
            )
        df = pd.read_csv(StringIO(payload), sep="\\s+", header=None, usecols=usecols)
    except Exception as exc:
        raise ValueError("Error reading xvg file") from exc
    return df
def npy2csv(data, fpath):
    """Save a NumPy array to a file in either .csv or .npy format.

    Parameters
    ----------
    data : np.ndarray
        The data to be saved.
    fpath : str
        Path to the output file. The file extension determines the format:
        ``.csv`` writes comma-separated text with values formatted as
        ``%.3E``; ``.npy`` writes a NumPy binary file.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.npy``. Previously such
        calls returned without writing anything, silently dropping the data.
    """
    ftype = Path(fpath).suffix[1:]
    if ftype == "csv":
        df = pd.DataFrame(data)
        df.to_csv(fpath, index=False, header=False, float_format="%.3E", sep=",")
    elif ftype == "npy":
        np.save(fpath, data)
    else:
        raise ValueError("Unsupported file type")
def save_1d_data(data, ids=None, fpath="dfi.xvg", sep=" "):
    """Write a two-column (ids, data) table without header or index.

    Floating-point values are written with the ``%.3E`` format.

    Parameters
    ----------
    data : list or np.ndarray
        Values for the second column.
    ids : list or np.ndarray, optional
        Values for the first column. When omitted, the integers
        1..len(data) are used.
    fpath : str, optional
        Path to the output file (default is 'dfi.xvg').
    sep : str, optional
        Field separator in the output file (default is a single space).

    Returns
    -------
    None
    """
    x_column = np.arange(1, len(data) + 1).astype(int) if ids is None else ids
    table = pd.DataFrame({"ids": x_column, "data": data})
    table.to_csv(fpath, sep=sep, float_format="%.3E", header=None, index=False)
def save_2d_data(data, fpath="dfi.xvg", sep=" "):
    """Write a two-dimensional table without header or index.

    Floating-point values are written with the ``%.3E`` format.

    Parameters
    ----------
    data : list or np.ndarray
        The 2D data to be saved; one output line per row.
    fpath : str, optional
        Path to the output file (default is 'dfi.xvg').
    sep : str, optional
        Field separator in the output file (default is a single space).

    Returns
    -------
    None
    """
    pd.DataFrame(data).to_csv(
        fpath,
        sep=sep,
        float_format="%.3E",
        header=None,
        index=False,
    )