Source code for reforge.io

"""File: io.py

Description:
    This module provides I/O utilities for the reForge workflow, including functions 
    for reading and saving various data formats (e.g., CSV, NPY, XVG) as well as parsing 
    PDB files into domain-specific objects. Additionally, it offers helper functions for 
    filtering file names and recursively retrieving file paths from directories.

Usage Example:
    >>> from reforge.io import read_positions, pdb2system, npy2csv
    >>> import numpy as np
    >>> # Read positions from an MDAnalysis Universe object 'u'
    >>> positions = read_positions(u, ag, b=0, e=500, sample_rate=1)
    >>> # Save a NumPy array to CSV format
    >>> data = np.random.rand(100, 10)
    >>> npy2csv(data, 'output.csv')
    >>> # Parse a PDB file into a System object
    >>> system = pdb2system('structure.pdb')

Requirements:
    - Python 3.x
    - NumPy
    - Pandas
    - pathlib
    - reForge utilities (timeit, memprofit, logger)
    - reForge pdbtools (AtomList, System, PDBParser)

Author: Your Name
Date: YYYY-MM-DD
"""

from pathlib import Path
import numpy as np
import pandas as pd
from reforge.utils import timeit, memprofit, logger
from reforge.pdbtools import AtomList, System, PDBParser

################################################################################
## Reading Trajectories with MDAnalysis
################################################################################


[docs]
@timeit
@memprofit
def read_positions(u, ag, b=0, e=10000000, sample_rate=1, dtype=np.float32):
    """Collect the coordinates of an atom group over a trajectory.

    The trajectory of ``u`` is walked with a stride of ``sample_rate``. For
    every visited frame whose ``ts.time`` lies strictly between ``b`` and
    ``e``, the current ``ag.positions`` are flattened into one row. The rows
    are stacked and transposed, so each column of the result corresponds to
    one retained frame.

    Parameters
    ----------
    u : MDAnalysis.Universe
        Universe whose ``trajectory`` is iterated.
    ag : MDAnalysis.AtomGroup
        Atom group whose ``positions`` are sampled at each retained frame.
    b : int, optional
        Exclusive lower bound compared against ``ts.time`` (default is 0).
        NOTE(review): the bound is applied to the frame *time*, not the frame
        index, and a frame with ``ts.time == b`` is skipped - confirm intended.
    e : int, optional
        Exclusive upper bound compared against ``ts.time``
        (default is 10000000).
    sample_rate : int, optional
        Stride used when slicing the trajectory (default is 1, i.e. every
        frame is visited).
    dtype : data-type, optional
        Data type of the returned array (default is np.float32).

    Returns
    -------
    np.ndarray
        C-contiguous array with one column per retained frame, i.e. shape
        (n_coords, n_frames) when at least one frame passes the time filter.
    """

    logger.info("Reading positions...")
    rows = []
    for ts in u.trajectory[::sample_rate]:
        # Frames outside the open interval (b, e) are not sampled.
        if not b < ts.time < e:
            continue
        rows.append(ag.positions.flatten())
    stacked = np.array(rows, dtype=dtype)
    # Transpose to (coords, frames) and force a contiguous memory layout.
    arr = np.ascontiguousarray(stacked.T)
    logger.info("Done!")
    return arr




[docs]
@timeit
@memprofit
def read_velocities(u, ag, b=0, e=10000000, sample_rate=1, dtype=np.float32):
    """Collect the velocities of an atom group over a trajectory.

    The trajectory of ``u`` is walked with a stride of ``sample_rate``. For
    every visited frame whose ``ts.time`` lies strictly between ``b`` and
    ``e``, the current ``ag.velocities`` are flattened into one row. The rows
    are stacked and transposed, so each column of the result corresponds to
    one retained frame.

    Parameters
    ----------
    u : MDAnalysis.Universe
        Universe whose ``trajectory`` is iterated.
    ag : MDAnalysis.AtomGroup
        Atom group whose ``velocities`` are sampled at each retained frame.
    b : int, optional
        Exclusive lower bound compared against ``ts.time`` (default is 0).
    e : int, optional
        Exclusive upper bound compared against ``ts.time``
        (default is 10000000).
    sample_rate : int, optional
        Stride used when slicing the trajectory (default is 1).
    dtype : data-type, optional
        Data type of the returned array (default is np.float32).

    Returns
    -------
    np.ndarray
        C-contiguous array with one column per retained frame.
    """
    logger.info("Reading velocities...")
    rows = []
    for ts in u.trajectory[::sample_rate]:
        # Frames outside the open interval (b, e) are not sampled.
        if not b < ts.time < e:
            continue
        rows.append(ag.velocities.flatten())
    stacked = np.array(rows, dtype=dtype)
    # Transpose to (components, frames) and force a contiguous memory layout.
    arr = np.ascontiguousarray(stacked.T)
    logger.info("Done!")
    return arr




[docs]
def parse_covar_dat(file, dtype=np.float32):
    """Load a whitespace-delimited covar.dat file as a square matrix.

    The file is read as a headerless table, and ``resnum`` is derived from the
    number of table rows as ``int(sqrt(n_rows / 3))``. All values are then
    laid out row by row into a ``(3 * resnum, 3 * resnum)`` array.

    Parameters
    ----------
    file : str
        Path to the covar.dat file.
    dtype : data-type, optional
        Data type of the returned matrix (default is np.float32).

    Returns
    -------
    np.ndarray
        2D covariance matrix of shape (3*resnum, 3*resnum).
    """
    table = pd.read_csv(file, sep="\\s+", header=None)
    values = np.asarray(table, dtype=dtype)
    n_rows = len(values)
    # The matrix side length is inferred from the row count alone.
    resnum = int(np.sqrt(n_rows / 3))
    side = 3 * resnum
    return values.reshape((side, side))



################################################################################
## File Filtering and Retrieval Functions
################################################################################


[docs]
def fname_filter(f, sw="", cont="", ew=""):
    """Tell whether a file name has the requested prefix, substring and suffix.

    Parameters
    ----------
    f : str
        The file name to test.
    sw : str, optional
        Prefix the name must start with (default is an empty string, which
        matches any name).
    cont : str, optional
        Substring that must occur in the name (default is an empty string).
    ew : str, optional
        Suffix the name must end with (default is an empty string).

    Returns
    -------
    bool
        True only when all three conditions hold; otherwise False.
    """
    # Conditions are checked in prefix -> substring -> suffix order and the
    # first failure short-circuits the rest.
    if not f.startswith(sw):
        return False
    if cont not in f:
        return False
    return f.endswith(ew)




[docs]
def filter_files(fpaths, sw="", cont="", ew=""):
    """Keep only the paths whose file name passes ``fname_filter``.

    Parameters
    ----------
    fpaths : list[Path]
        Path-like objects exposing a ``name`` attribute (e.g. pathlib.Path).
    sw : str, optional
        Prefix the file name must start with (default is an empty string).
    cont : str, optional
        Substring the file name must contain (default is an empty string).
    ew : str, optional
        Suffix the file name must end with (default is an empty string).

    Returns
    -------
    list[Path]
        The matching entries of ``fpaths``, in their original order.
    """
    selected = []
    for path in fpaths:
        # Only the final path component is matched, not the parent directories.
        if fname_filter(path.name, sw=sw, cont=cont, ew=ew):
            selected.append(path)
    return selected




[docs]
def pull_files(directory, pattern):
    r"""Recursively glob a directory tree and return the matches as strings.

    Parameters
    ----------
    directory : str or Path
        The root directory to search.
    pattern : str
        The glob pattern handed to ``Path.rglob`` (e.g., \*.txt).

    Returns
    -------
    list[str]
        String form of every path yielded by ``Path(directory).rglob(pattern)``.
        The paths are not resolved, so they are absolute only when
        ``directory`` itself is absolute, and any directory whose name matches
        the pattern is included alongside regular files.

    Raises
    ------
    FileNotFoundError
        If the specified directory does not exist or is not a directory.
    """
    root = Path(directory)
    if not (root.exists() and root.is_dir()):
        raise FileNotFoundError(
            f"Directory '{directory}' does not exist or is not a directory."
        )
    matches = root.rglob(pattern)
    return list(map(str, matches))




[docs]
def pull_all_files(directory):
    """Return every entry found beneath a directory, searched recursively.

    This is a thin wrapper that delegates to ``pull_files`` with the
    catch-all glob pattern ``"*"``.

    Parameters
    ----------
    directory : str or Path
        The directory to search.

    Returns
    -------
    list[str]
        Whatever ``pull_files(directory, "*")`` returns: one string per
        matched path.
    """
    catch_all = "*"
    return pull_files(directory, catch_all)



################################################################################
## Data Conversion and I/O Functions
################################################################################


[docs]
def xvg2npy(xvg_path, npy_path, usecols=(0, 1)):
    """Convert a GROMACS XVG file to a NumPy binary file (.npy).

    Lines starting with ``#``, ``@`` or ``&`` are dropped before parsing, so
    both bare numeric tables and XVG files carrying a header can be converted.
    The selected columns are transposed (one row per column of the file),
    squeezed, and written with ``np.save``.

    Parameters
    ----------
    xvg_path : str
        Path to the input XVG file.
    npy_path : str
        Path where the output .npy file will be saved.
    usecols : list of int, optional
        Column indices to read from the XVG file (default is (0, 1)).

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the XVG file cannot be opened or parsed.
    """
    # Local stdlib import: keeps this function self-contained.
    from io import StringIO

    try:
        with open(xvg_path, encoding="utf-8") as handle:
            # '#' = comments, '@' = plot directives, '&' = dataset separators.
            numeric_text = "".join(
                line
                for line in handle
                if not line.lstrip().startswith(("#", "@", "&"))
            )
        df = pd.read_csv(
            StringIO(numeric_text), sep="\\s+", header=None, usecols=usecols
        )
    except Exception as exc:
        raise ValueError("Error reading XVG file") from exc
    # Transpose so each selected column becomes a row; squeeze drops the
    # leading axis when only one column was requested.
    data = np.squeeze(df.to_numpy().T)
    np.save(npy_path, data)




[docs]
def pdb2system(pdb_path) -> System:
    """Build a System from a PDB file.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.

    Returns
    -------
    System
        The object returned by ``PDBParser(pdb_path).parse()``.
    """
    return PDBParser(pdb_path).parse()




[docs]
def pdb2atomlist(pdb_path) -> AtomList:
    """Build an AtomList from a PDB file.

    Parameters
    ----------
    pdb_path : str
        Path to the PDB file.

    Returns
    -------
    AtomList
        A new AtomList after its ``read_pdb`` method has been called with
        ``pdb_path``.
    """
    atom_list = AtomList()
    # read_pdb fills the list in place; its return value is not used.
    atom_list.read_pdb(pdb_path)
    return atom_list




[docs]
def read_data(fpath, *, expected_len=1104, max_xvg_len=10000):
    """Read data from a file (.csv, .npy, .dat, or .xvg) as a NumPy array.

    The format is chosen from the file extension:

    * ``.npy``  - loaded with ``np.load``.
    * ``.csv`` / ``.dat`` - parsed as a headerless, whitespace-delimited table
      and squeezed.
    * ``.xvg`` - lines starting with ``#``, ``@`` or ``&`` are dropped, then
      column 1 of the remaining whitespace-delimited table is returned.

    Parameters
    ----------
    fpath : str
        Path to the data file.
    expected_len : int or None, optional
        Required length of the first axis of the squeezed csv/dat data.
        Defaults to 1104, the value that used to be hard-coded; pass ``None``
        to accept any length.
    max_xvg_len : int or None, optional
        Largest accepted length of the first axis of the squeezed xvg data.
        Defaults to 10000, the value that used to be hard-coded; pass ``None``
        to disable the limit.

    Returns
    -------
    np.ndarray
        The data loaded from the file.

    Raises
    ------
    ValueError
        If the extension is unsupported, the file cannot be read, or the data
        fails the length check for its format.
    """
    # Local stdlib import: keeps this function self-contained.
    from io import StringIO

    ftype = Path(fpath).suffix[1:]

    if ftype == "npy":
        try:
            return np.load(fpath)
        except Exception as exc:
            raise ValueError("Error loading npy file") from exc

    if ftype in {"csv", "dat"}:
        try:
            # NOTE(review): the separator is whitespace, so comma-delimited
            # content is not split into columns - confirm callers only pass
            # whitespace-delimited files here.
            df = pd.read_csv(fpath, sep="\\s+", header=None)
        except Exception as exc:
            raise ValueError("Error reading csv/dat file") from exc
        data = np.squeeze(df.values)
        # A single-value file squeezes to a 0-d array; treat it as length 1.
        length = data.shape[0] if data.ndim else 1
        # Checked outside the try so this specific message reaches the caller.
        if expected_len is not None and length != expected_len:
            raise ValueError("Data shape mismatch for csv/dat file")
        return data

    if ftype == "xvg":
        try:
            with open(fpath, encoding="utf-8") as handle:
                # '#' = comments, '@' = plot directives, '&' = separators.
                numeric_text = "".join(
                    line
                    for line in handle
                    if not line.lstrip().startswith(("#", "@", "&"))
                )
            df = pd.read_csv(
                StringIO(numeric_text), sep="\\s+", header=None, usecols=[1]
            )
        except Exception as exc:
            raise ValueError("Error reading xvg file") from exc
        data = np.squeeze(df.values)
        length = data.shape[0] if data.ndim else 1
        if max_xvg_len is not None and length > max_xvg_len:
            raise ValueError("Data shape too large for xvg file")
        return data

    raise ValueError("Unsupported file type")




[docs]
def read_xvg(fpath, usecols=(0, 1)):
    """Read a GROMACS XVG file and return its contents as a Pandas DataFrame.

    Lines starting with ``#``, ``@`` or ``&`` are dropped before parsing, so
    both bare numeric tables and XVG files carrying a header can be read. The
    remaining text is parsed as a headerless, whitespace-delimited table.

    Parameters
    ----------
    fpath : str
        Path to the XVG file.
    usecols : list of int, optional
        Column indices to read from the file (default is (0, 1)).

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the selected columns from the XVG file.

    Raises
    ------
    ValueError
        If the file cannot be opened or parsed.
    """
    # Local stdlib import: keeps this function self-contained.
    from io import StringIO

    try:
        with open(fpath, encoding="utf-8") as handle:
            # '#' = comments, '@' = plot directives, '&' = dataset separators.
            numeric_text = "".join(
                line
                for line in handle
                if not line.lstrip().startswith(("#", "@", "&"))
            )
        df = pd.read_csv(
            StringIO(numeric_text), sep="\\s+", header=None, usecols=usecols
        )
    except Exception as exc:
        raise ValueError("Error reading xvg file") from exc
    return df




[docs]
def npy2csv(data, fpath):
    """Save a NumPy array to a file in either .csv or .npy format.

    Parameters
    ----------
    data : np.ndarray
        The data to be saved.
    fpath : str
        Path to the output file. The file extension determines the format:
        ``.csv`` writes comma-separated text without header or index, floats
        formatted as ``%.3E``; ``.npy`` writes a NumPy binary via ``np.save``.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If the extension is neither ``.csv`` nor ``.npy``. Such paths used to
        be ignored silently, which dropped the data without any notice.
    """
    ftype = Path(fpath).suffix[1:]
    if ftype == "csv":
        df = pd.DataFrame(data)
        df.to_csv(fpath, index=False, header=None, float_format="%.3E", sep=",")
    elif ftype == "npy":
        # The documented .npy branch was missing from the implementation.
        np.save(fpath, data)
    else:
        raise ValueError(
            f"Unsupported file type '{ftype}': expected a .csv or .npy path"
        )




[docs]
def save_1d_data(data, ids=None, fpath="dfi.xvg", sep=" "):
    """Write a 1D series as a two-column, headerless text table.

    Each output line holds one x value and one y value joined by ``sep``.
    Floating-point values are formatted as ``%.3E``; no header row and no
    index column are written.

    Parameters
    ----------
    data : list or np.ndarray
        The y-column values.
    ids : list or np.ndarray, optional
        The x-column values. When omitted, the integers 1..len(data) are used.
    fpath : str, optional
        Path to the output file (default is 'dfi.xvg').
    sep : str, optional
        Field separator in the output file (default is a single space).

    Returns
    -------
    None
    """
    # Fall back to 1-based running indices when no x values are supplied.
    x_column = np.arange(1, len(data) + 1).astype(int) if ids is None else ids
    table = pd.DataFrame({"ids": x_column, "data": data})
    table.to_csv(fpath, sep=sep, index=False, header=None, float_format="%.3E")




[docs]
def save_2d_data(data, fpath="dfi.xvg", sep=" "):
    """Write 2D data as a headerless text table.

    Each row of ``data`` becomes one output line whose fields are joined by
    ``sep``. Floating-point values are formatted as ``%.3E``; no header row
    and no index column are written.

    Parameters
    ----------
    data : list or np.ndarray
        The 2D data to be saved.
    fpath : str, optional
        Path to the output file (default is 'dfi.xvg').
    sep : str, optional
        Field separator in the output file (default is a single space).

    Returns
    -------
    None
    """
    table = pd.DataFrame(data)
    table.to_csv(fpath, sep=sep, index=False, header=None, float_format="%.3E")