Source code for meld.parse

#
# Copyright 2015 by Justin MacCallum, Alberto Perez, Ken Dill
# All rights reserved
#

"""
Functions to read in sequences, secondary structures, and RDCs

"""

from meld import interfaces
from meld.system import scalers
from meld.system import restraints
from meld.system import indexing
from openmm import unit as u  # type: ignore

from typing import List, Optional, TextIO, NewType
from collections import namedtuple

SequenceString = NewType("SequenceString", str)
SecondaryString = NewType("SecondaryString", str)


aa_map = {
    "A": "ALA",
    "C": "CYS",
    "D": "ASP",
    "E": "GLU",
    "F": "PHE",
    "G": "GLY",
    "H": "HIE",
    "I": "ILE",
    "K": "LYS",
    "L": "LEU",
    "M": "MET",
    "N": "ASN",
    "P": "PRO",
    "Q": "GLN",
    "R": "ARG",
    "S": "SER",
    "T": "THR",
    "V": "VAL",
    "W": "TRP",
    "Y": "TYR",
}


# copy the canonical forms
allowed_residues = [aa for aa in aa_map.values()]
# add the alternate protonation states
allowed_residues += [
    "ASH",
    "GLH",
    "HIE",
    "HID",
    "HIP",
    "LYN",
    "ACE",
    "OHE",
    "NME",
    "NHE",
]


[docs]def get_sequence_from_AA1( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, capped: bool = False, nter: Optional[str] = None, cter: Optional[str] = None, ) -> SequenceString: """ Get the sequence from 1-letter amino acid codes. Args: filename: filename to open content: contents file: object to read from capped: Will know that there are caps. Specify which in nter and cter nter: specify capping residue at the N terminus if not specified in sequence cter: specify capping residue at the C terminus if not specified in sequence Returns: string representation that can be used to create a :class:`SubSystem` Note: Specify exactly one of filename, contents, file """ contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = "".join(lines) output = [] for aa in sequence: try: four_letter = aa_map[aa] output.append(four_letter) except KeyError: raise RuntimeError(f'Unknown amino acid "aa".') # append terminal qualifiers if not capped: output[0] = "N" + output[0] output[-1] = "C" + output[-1] else: if nter: output.insert(0, nter) if cter: output.append(cter) max_aa_per_line = 100 groups = [ output[i : i + max_aa_per_line] for i in range(0, len(sequence), max_aa_per_line) ] lines = [" ".join(group) for group in groups] return SequenceString("\n".join(lines))
[docs]def get_sequence_from_AA3( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, capped: bool = False, nter: Optional[str] = None, cter: Optional[str] = None, ) -> SequenceString: """ Get the sequence from a 3-letter amino acid codes. Args: filename: filename to open content: contents file: object to read from capped: Will know that there are caps. Specify which in nter and cter nter: specify capping residue at the N terminus if not specified in sequence cter: specify capping residue at the C terminus if not specified in sequence Returns: string representation that can be used to create a :class:`SubSystem` Note: Specify exactly one of filename, contents, file """ contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = " ".join(lines).split() output = [] for aa in sequence: if aa not in allowed_residues: raise RuntimeError(f"Unknown residue {aa}.") else: output.append(aa) # append terminal qualifiers if not capped: output[0] = "N" + output[0] output[-1] = "C" + output[-1] else: if nter: output.insert(0, nter) if cter: output.append(cter) return SequenceString(" ".join(output))
[docs]def get_secondary_structure_restraints( system: interfaces.ISystem, scaler: scalers.RestraintScaler, ramp: Optional[scalers.TimeRamp] = None, torsion_force_constant: Optional[u.Quantity] = None, distance_force_constant: Optional[u.Quantity] = None, quadratic_cut: Optional[u.Quantity] = None, first_residue: Optional[indexing.ResidueIndex] = None, min_secondary_match: int = 4, filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> List[restraints.RestraintGroup]: """ Get a list of secondary structure restraints. Args: system: the system scaler: the force scaler ramp: the ramp, default is ConstantRamp torsion_force_constant: force constant for torsions, default 2.5e-2 kJ/mol/deg^2 distance_force_constant: force constant for distances, default 2.5 kJ/mol/nm^2 quadratic_cut: switch from quadratic to linear beyond this distance, default 0.2 nm min_secondary_match: minimum number of elements to match in secondary structure first_residue: residue at which these secondary structure restraints start filename: filename to open content: contents to process file: object to read from Returns A list of :class:`RestraintGroups` Note: Specify exactly one of filename, contents, file. """ torsion_force_constant = ( 2.5e-2 * u.kilojoule_per_mole / u.degree ** 2 if torsion_force_constant is None else torsion_force_constant ) distance_force_constant = ( 2500 * u.kilojoule_per_mole / u.nanometer ** 2 if distance_force_constant is None else distance_force_constant ) quadratic_cut = 0.2 * u.nanometer if quadratic_cut is None else quadratic_cut if ramp is None: ramp = restraints.ConstantRamp() if first_residue is None: first_residue = indexing.ResidueIndex(0) else: assert isinstance(first_residue, indexing.ResidueIndex) if min_secondary_match > 5: raise RuntimeError( "Minimum number of elements to match in secondary structure " "must be less than or equal to 5." ) min_secondary_match = int(min_secondary_match) contents = _get_secondary_sequence(filename, content, file) groups = [] helices = _extract_secondary_runs( contents, "H", 5, min_secondary_match, first_residue ) for helix in helices: rests: List[restraints.SelectableRestraint] = [] for index in range(helix.start + 1, helix.end - 1): phi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index - 1, "C"), system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), -62.5 * u.degree, 17.5 * u.degree, torsion_force_constant, ) psi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), system.index.atom(index + 1, "N"), -42.5 * u.degree, 17.5 * u.degree, torsion_force_constant, ) rests.append(phi) rests.append(psi) d1 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start, "CA"), system.index.atom(helix.start + 3, "CA"), 0 * u.nanometer, 0.485 * u.nanometer, 0.561 * u.nanometer, 0.561 * u.nanometer + quadratic_cut, distance_force_constant, ) d2 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start + 1, "CA"), system.index.atom(helix.start + 4, "CA"), 0 * u.nanometer, 0.485 * u.nanometer, 0.561 * u.nanometer, 0.561 * u.nanometer + quadratic_cut, distance_force_constant, ) d3 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start, "CA"), system.index.atom(helix.start + 4, "CA"), 0 * u.nanometer, 0.581 * u.nanometer, 0.684 * u.nanometer, 0.684 * u.nanometer + quadratic_cut, distance_force_constant, ) rests.append(d1) rests.append(d2) rests.append(d3) group = restraints.RestraintGroup(rests, len(rests)) groups.append(group) extended = _extract_secondary_runs( contents, "E", 5, min_secondary_match, first_residue ) for ext in extended: rests = [] for index in range(ext.start + 1, ext.end - 1): phi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index - 1, "C"), system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), -117.5 * u.degree, 27.5 * u.degree, torsion_force_constant, ) psi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), system.index.atom(index + 1, "N"), 145 * u.degree, 25.0 * u.degree, torsion_force_constant, ) rests.append(phi) rests.append(psi) d1 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start, "CA"), system.index.atom(ext.start + 3, "CA"), 0 * u.nanometer, 0.785 * u.nanometer, 1.063 * u.nanometer, 1.063 * u.nanometer + quadratic_cut, distance_force_constant, ) d2 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start + 1, "CA"), system.index.atom(ext.start + 4, "CA"), 0 * u.nanometer, 0.785 * u.nanometer, 1.063 * u.nanometer, 1.063 * u.nanometer + quadratic_cut, distance_force_constant, ) d3 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start, "CA"), system.index.atom(ext.start + 4, "CA"), 0 * u.nanometer, 1.086 * u.nanometer, 1.394 * u.nanometer, 1.394 * u.nanometer + quadratic_cut, distance_force_constant, ) rests.append(d1) rests.append(d2) rests.append(d3) group = restraints.RestraintGroup(rests, len(rests)) groups.append(group) return groups
def _get_secondary_sequence( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> SecondaryString: contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = "".join(lines) for ss in sequence: if ss not in "HE.": raise RuntimeError(f'Unknown secondary structure type "{ss}"') return SecondaryString(sequence) _SecondaryRun = namedtuple("_SecondaryRun", "start end") def _extract_secondary_runs( content: str, ss_type: str, run_length: int, at_least: int, first_residue: int ) -> List[_SecondaryRun]: # mark the elements that have the correct type has_correct_type = [1 if ss == ss_type else 0 for ss in content] # compute the number of correct elements in each segment length = len(content) totals = [0] * (length - run_length + 1) for index in range(length - run_length + 1): totals[index] = sum(has_correct_type[index : index + run_length]) # add a result whenever we've had at_least correct results = [] for index in range(len(totals)): if totals[index] >= at_least: results.append(_SecondaryRun(index, index + run_length)) # At this point, the runs are zero-based relative to the start of the # secondary structure string. Now, we'll add the offset to make them # relative to the first residue results = [ _SecondaryRun(s.start + first_residue, s.end + first_residue) for s in results ] return results def _handle_arguments( filename: Optional[str], contents: Optional[str], file: Optional[TextIO] ) -> str: set_args = [arg for arg in [filename, contents, file] if arg is not None] if len(set_args) != 1: raise RuntimeError("Must set exactly one of filename, contents or file.") if filename: content = open(filename).read() elif contents: content = contents elif file: content = file.read() return content
[docs]def get_rdc_restraints( system: interfaces.ISystem, alignment_index: int, scaler: restraints.RestraintScaler, ramp: Optional[restraints.TimeRamp] = None, quadratic_cut: Optional[u.Quantity] = None, filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> List[restraints.RdcRestraint]: """ Reads restraints from file and returns as RdcRestraint object. Args: system: the system object for the restraints to be added to. alignment_index: index of the alignment tensor to use scaler: object to scale the force constant. ramp: ramp, default is ConstantRamp() quadratic_cut: restraints become linear beyond this deviation, default 999 Hz filename : filename to open content : contents to process file : object to read from Returns: list of restraints from input Note: The value of kappa is assumed to be in units of Hz :math:`Angstrom^3`. Note: All indexing in the input file is assumed to be 1-based. Note: The expected order of columns in the input file is: - residue i - atom name i - residue j - atom name j - observed splitting (:math:`Hz`) - experiment (ignored) - tolerance (:math:`Hz`) - kappa (:math:`Hz \AA^3`) - force constant (:math:`kJ mol^{-1} Hz^{-2}`) - weight """ quadratic_cut = 999.0 / u.seconds if quadratic_cut is None else quadratic_cut if ramp is None: ramp = restraints.ConstantRamp() n_align = system.num_alignments if alignment_index >= n_align: raise ValueError( f"Alignment index {alignment_index} out of range for system with {n_align} alignments" ) contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] restraint_list = [] for line in lines: cols = line.split() res_i = int(cols[0]) atom_i = cols[1] res_j = int(cols[2]) atom_j = cols[3] obs = float(cols[4]) expt = int(cols[5]) tolerance = float(cols[6]) kappa = float(cols[7]) force_const = float(cols[8]) weight = float(cols[9]) atom_index_i = system.index.atom(res_i, atom_i, one_based=True) atom_index_j = system.index.atom(res_j, atom_j, one_based=True) rest = restraints.RdcRestraint( system, scaler, ramp, atom_index_i, atom_index_j, kappa * u.second ** -1 * u.angstrom ** 3, obs * u.second ** -1, tolerance * u.second ** -1, force_const * u.kilojoule_per_mole * u.second ** 2, quadratic_cut, weight, alignment_index, ) restraint_list.append(rest) return restraint_list