Source code for meld.parse

#
# Copyright 2015 by Justin MacCallum, Alberto Perez, Ken Dill
# All rights reserved
#

"""
Functions to read in sequences, secondary structures, and RDCs

"""

from collections import namedtuple
from typing import List, NewType, Optional, TextIO

from openmm import unit as u  # type: ignore

from meld import interfaces
from meld.system import indexing, restraints, scalers

SequenceString = NewType("SequenceString", str)
SecondaryString = NewType("SecondaryString", str)


aa_map = {
    "A": "ALA",
    "C": "CYS",
    "D": "ASP",
    "E": "GLU",
    "F": "PHE",
    "G": "GLY",
    "H": "HIE",
    "I": "ILE",
    "K": "LYS",
    "L": "LEU",
    "M": "MET",
    "N": "ASN",
    "P": "PRO",
    "Q": "GLN",
    "R": "ARG",
    "S": "SER",
    "T": "THR",
    "V": "VAL",
    "W": "TRP",
    "Y": "TYR",
}


# copy the canonical forms
allowed_residues = [aa for aa in aa_map.values()]
# add the alternate protonation states
allowed_residues += [
    "ASH",
    "GLH",
    "HIE",
    "HID",
    "HIP",
    "LYN",
    "ACE",
    "OHE",
    "NME",
    "NHE",
]


[docs]def get_sequence_from_AA1( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, capped: bool = False, nter: Optional[str] = None, cter: Optional[str] = None, ) -> SequenceString: """ Get the sequence from 1-letter amino acid codes. Args: filename: filename to open content: contents file: object to read from capped: Will know that there are caps. Specify which in nter and cter nter: specify capping residue at the N terminus if not specified in sequence cter: specify capping residue at the C terminus if not specified in sequence Returns: string representation that can be used to create a :class:`SubSystem` Note: Specify exactly one of filename, contents, file """ contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = "".join(lines) output = [] for aa in sequence: try: four_letter = aa_map[aa] output.append(four_letter) except KeyError: raise RuntimeError(f'Unknown amino acid "aa".') # append terminal qualifiers if not capped: output[0] = "N" + output[0] output[-1] = "C" + output[-1] else: if nter: output.insert(0, nter) if cter: output.append(cter) max_aa_per_line = 100 groups = [ output[i : i + max_aa_per_line] for i in range(0, len(sequence), max_aa_per_line) ] lines = [" ".join(group) for group in groups] return SequenceString("\n".join(lines))
[docs]def get_sequence_from_AA3( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, capped: bool = False, nter: Optional[str] = None, cter: Optional[str] = None, ) -> SequenceString: """ Get the sequence from a 3-letter amino acid codes. Args: filename: filename to open content: contents file: object to read from capped: Will know that there are caps. Specify which in nter and cter nter: specify capping residue at the N terminus if not specified in sequence cter: specify capping residue at the C terminus if not specified in sequence Returns: string representation that can be used to create a :class:`SubSystem` Note: Specify exactly one of filename, contents, file """ contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = " ".join(lines).split() output = [] for aa in sequence: if aa not in allowed_residues: raise RuntimeError(f"Unknown residue {aa}.") else: output.append(aa) # append terminal qualifiers if not capped: output[0] = "N" + output[0] output[-1] = "C" + output[-1] else: if nter: output.insert(0, nter) if cter: output.append(cter) return SequenceString(" ".join(output))
[docs]def get_secondary_structure_restraints( system: interfaces.ISystem, scaler: scalers.RestraintScaler, ramp: Optional[scalers.TimeRamp] = None, torsion_force_constant: Optional[u.Quantity] = None, distance_force_constant: Optional[u.Quantity] = None, quadratic_cut: Optional[u.Quantity] = None, first_residue: Optional[indexing.ResidueIndex] = None, min_secondary_match: int = 4, filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> List[restraints.RestraintGroup]: """ Get a list of secondary structure restraints. Args: system: the system scaler: the force scaler ramp: the ramp, default is ConstantRamp torsion_force_constant: force constant for torsions, default 2.5e-2 kJ/mol/deg^2 distance_force_constant: force constant for distances, default 2.5 kJ/mol/nm^2 quadratic_cut: switch from quadratic to linear beyond this distance, default 0.2 nm min_secondary_match: minimum number of elements to match in secondary structure first_residue: residue at which these secondary structure restraints start filename: filename to open content: contents to process file: object to read from Returns A list of :class:`RestraintGroups` Note: Specify exactly one of filename, contents, file. """ torsion_force_constant = ( 2.5e-2 * u.kilojoule_per_mole / u.degree**2 if torsion_force_constant is None else torsion_force_constant ) distance_force_constant = ( 2500 * u.kilojoule_per_mole / u.nanometer**2 if distance_force_constant is None else distance_force_constant ) quadratic_cut = 0.2 * u.nanometer if quadratic_cut is None else quadratic_cut if ramp is None: ramp = restraints.ConstantRamp() if first_residue is None: first_residue = indexing.ResidueIndex(0) else: assert isinstance(first_residue, indexing.ResidueIndex) if min_secondary_match > 5: raise RuntimeError( "Minimum number of elements to match in secondary structure " "must be less than or equal to 5." ) min_secondary_match = int(min_secondary_match) contents = _get_secondary_sequence(filename, content, file) groups = [] helices = _extract_secondary_runs( contents, "H", 5, min_secondary_match, first_residue ) for helix in helices: rests: List[restraints.SelectableRestraint] = [] for index in range(helix.start + 1, helix.end - 1): phi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index - 1, "C"), system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), -62.5 * u.degree, 17.5 * u.degree, torsion_force_constant, ) psi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), system.index.atom(index + 1, "N"), -42.5 * u.degree, 17.5 * u.degree, torsion_force_constant, ) rests.append(phi) rests.append(psi) d1 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start, "CA"), system.index.atom(helix.start + 3, "CA"), 0 * u.nanometer, 0.485 * u.nanometer, 0.561 * u.nanometer, 0.561 * u.nanometer + quadratic_cut, distance_force_constant, ) d2 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start + 1, "CA"), system.index.atom(helix.start + 4, "CA"), 0 * u.nanometer, 0.485 * u.nanometer, 0.561 * u.nanometer, 0.561 * u.nanometer + quadratic_cut, distance_force_constant, ) d3 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(helix.start, "CA"), system.index.atom(helix.start + 4, "CA"), 0 * u.nanometer, 0.581 * u.nanometer, 0.684 * u.nanometer, 0.684 * u.nanometer + quadratic_cut, distance_force_constant, ) rests.append(d1) rests.append(d2) rests.append(d3) group = restraints.RestraintGroup(rests, len(rests)) groups.append(group) extended = _extract_secondary_runs( contents, "E", 5, min_secondary_match, first_residue ) for ext in extended: rests = [] for index in range(ext.start + 1, ext.end - 1): phi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index - 1, "C"), system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), -117.5 * u.degree, 27.5 * u.degree, torsion_force_constant, ) psi = restraints.TorsionRestraint( system, scaler, ramp, system.index.atom(index, "N"), system.index.atom(index, "CA"), system.index.atom(index, "C"), system.index.atom(index + 1, "N"), 145 * u.degree, 25.0 * u.degree, torsion_force_constant, ) rests.append(phi) rests.append(psi) d1 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start, "CA"), system.index.atom(ext.start + 3, "CA"), 0 * u.nanometer, 0.785 * u.nanometer, 1.063 * u.nanometer, 1.063 * u.nanometer + quadratic_cut, distance_force_constant, ) d2 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start + 1, "CA"), system.index.atom(ext.start + 4, "CA"), 0 * u.nanometer, 0.785 * u.nanometer, 1.063 * u.nanometer, 1.063 * u.nanometer + quadratic_cut, distance_force_constant, ) d3 = restraints.DistanceRestraint( system, scaler, ramp, system.index.atom(ext.start, "CA"), system.index.atom(ext.start + 4, "CA"), 0 * u.nanometer, 1.086 * u.nanometer, 1.394 * u.nanometer, 1.394 * u.nanometer + quadratic_cut, distance_force_constant, ) rests.append(d1) rests.append(d2) rests.append(d3) group = restraints.RestraintGroup(rests, len(rests)) groups.append(group) return groups
def _get_secondary_sequence( filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> SecondaryString: contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] sequence = "".join(lines) for ss in sequence: if ss not in "HE.": raise RuntimeError(f'Unknown secondary structure type "{ss}"') return SecondaryString(sequence) _SecondaryRun = namedtuple("_SecondaryRun", "start end") def _extract_secondary_runs( content: str, ss_type: str, run_length: int, at_least: int, first_residue: int ) -> List[_SecondaryRun]: # mark the elements that have the correct type has_correct_type = [1 if ss == ss_type else 0 for ss in content] # compute the number of correct elements in each segment length = len(content) totals = [0] * (length - run_length + 1) for index in range(length - run_length + 1): totals[index] = sum(has_correct_type[index : index + run_length]) # add a result whenever we've had at_least correct results = [] for index in range(len(totals)): if totals[index] >= at_least: results.append(_SecondaryRun(index, index + run_length)) # At this point, the runs are zero-based relative to the start of the # secondary structure string. Now, we'll add the offset to make them # relative to the first residue results = [ _SecondaryRun(s.start + first_residue, s.end + first_residue) for s in results ] return results def _handle_arguments( filename: Optional[str], contents: Optional[str], file: Optional[TextIO] ) -> str: set_args = [arg for arg in [filename, contents, file] if arg is not None] if len(set_args) != 1: raise RuntimeError("Must set exactly one of filename, contents or file.") if filename: content = open(filename).read() elif contents: content = contents elif file: content = file.read() return content
[docs]def get_rdc_restraints( system: interfaces.ISystem, alignment_index: int, scaler: restraints.RestraintScaler, ramp: Optional[restraints.TimeRamp] = None, quadratic_cut: Optional[u.Quantity] = None, filename: Optional[str] = None, content: Optional[str] = None, file: Optional[TextIO] = None, ) -> List[restraints.RdcRestraint]: """ Reads restraints from file and returns as RdcRestraint object. Args: system: the system object for the restraints to be added to. alignment_index: index of the alignment tensor to use scaler: object to scale the force constant. ramp: ramp, default is ConstantRamp() quadratic_cut: restraints become linear beyond this deviation, default 999 Hz filename : filename to open content : contents to process file : object to read from Returns: list of restraints from input Note: The value of kappa is assumed to be in units of Hz :math:`Angstrom^3`. Note: All indexing in the input file is assumed to be 1-based. Note: The expected order of columns in the input file is: - residue i - atom name i - residue j - atom name j - observed splitting (:math:`Hz`) - experiment (ignored) - tolerance (:math:`Hz`) - kappa (:math:`Hz \\AA^3`) - force constant (:math:`kJ mol^{-1} Hz^{-2}`) - weight """ quadratic_cut = 999.0 / u.seconds if quadratic_cut is None else quadratic_cut if ramp is None: ramp = restraints.ConstantRamp() n_align = system.num_alignments if alignment_index >= n_align: raise ValueError( f"Alignment index {alignment_index} out of range for system with {n_align} alignments" ) contents = _handle_arguments(filename, content, file) lines = contents.splitlines() lines = [line.strip() for line in lines if not line.startswith("#")] restraint_list = [] for line in lines: cols = line.split() res_i = int(cols[0]) atom_i = cols[1] res_j = int(cols[2]) atom_j = cols[3] obs = float(cols[4]) expt = int(cols[5]) tolerance = float(cols[6]) kappa = float(cols[7]) force_const = float(cols[8]) weight = float(cols[9]) atom_index_i = system.index.atom(res_i, atom_i, one_based=True) atom_index_j = system.index.atom(res_j, atom_j, one_based=True) rest = restraints.RdcRestraint( system, scaler, ramp, atom_index_i, atom_index_j, kappa * u.second**-1 * u.angstrom**3, obs * u.second**-1, tolerance * u.second**-1, force_const * u.kilojoule_per_mole * u.second**2, quadratic_cut, weight, alignment_index, ) restraint_list.append(rest) return restraint_list