"""
ProteinArea.py
A package to calculate time-series protein area profile of MD data.
Use 2D voronoi cells to create realistic protein area with protein embedded
in lipids. The code assumes the membrane in MD data is oriented in X-Y plane.
"""
import MDAnalysis as mda
import numpy as np
import shapely.geometry as geo
import logging
import time
from MDAnalysis.analysis.base import AnalysisBase
from MDAnalysis.analysis.results import ResultsGroup
from scipy.spatial import Voronoi
def calc_area_per_slice(ag, nopbc=False):
"""
nopbc: flag to turn off pbc images, set to False.
"""
# create coordinates
if nopbc:
points_p, points_np = voronoi_pbc(ag, nopbc)
points = np.concatenate((points_p, points_np))
else:
points_p, points_np, points_pbc = voronoi_pbc(ag)
points = np.concatenate((points_p, points_np, points_pbc))
# if no protein atom is in this slice, return 0
p_size = len(points_p)
if p_size == 0:
return 0.0
# do voronoi
vor = Voronoi(points)
# calculate area
area_per_slice = 0
for region_index in vor.point_region[:p_size]:
region_vertices = vor.vertices[vor.regions[region_index]]
area_per_cell = geo.Polygon(region_vertices).area
area_per_slice += area_per_cell
return area_per_slice
def voronoi_pbc(ag, nopbc=False):
"""
Create pbc images for ag slice.
For current setup, there are in total 8 images as we are doing voronoi
in 2D
nopbc: flag to turn off pbc images, set to False.
"""
x, y, z = ag.dimensions[0:3]
# create protein and non-protein coordinates
points_p = ag.select_atoms("protein").positions[:, 0:2]
points_np = ag.select_atoms("not protein").positions[:, 0:2]
if nopbc:
return points_p, points_np
# create 8 pbc images
pbc_arrays = np.array(
[[x, 0], [x, y], [x, -y], [-x, 0], [-x, y], [-x, -y], [0, y], [0, -y]]
)
# create pbc coordinates
points_pbc = ag.copy().atoms.positions[:, 0:2] + pbc_arrays[0]
for pbc_array in pbc_arrays[1:]:
pos_pbc = ag.copy().atoms.positions[:, 0:2] + pbc_array
points_pbc = np.concatenate((points_pbc, pos_pbc))
return points_p, points_np, points_pbc
def voronoi_plot(points_p, points_np, points_pbc, name=""):
import matplotlib.pyplot as plt
from scipy.spatial import voronoi_plot_2d
p_size = len(points_p)
points = np.concatenate((points_p, points_np, points_pbc))
# voronoi
vor = Voronoi(points)
area_per_slice = 0
fig, ax = plt.subplots()
voronoi_plot_2d(
vor, ax=ax, show_points=False, show_vertices=False, point_size=5
)
plt.scatter(points_p[:, 0], points_p[:, 1], c="b", s=5, label="protein")
plt.scatter(
points_np[:, 0],
points_np[:, 1],
c="r",
s=5,
alpha=0.5,
label="non-protein",
)
plt.scatter(
points_pbc[:, 0],
points_pbc[:, 1],
c="grey",
s=5,
alpha=0.5,
label="pbc",
)
plt.xlim([-160, 300])
plt.ylim([-160, 300])
plt.savefig("./voronoi/area" + name + ".pdf")
plt.legend(loc="best")
plt.tight_layout()
plt.close()
[docs]
class ProteinArea(AnalysisBase):
"""
zmin: estimated zmin, should be lower than
the minimal z of the whole traj. Default=0
zmax: estimated zmax, should be higher
than the maximal z of the whole traj. Default=150
layer: thickness of a layer. default=0.5
nopbc: flag to turn off pbc images, set to False.
showpoints: output the voronoi points at certain layer(default=-1)
"""
@classmethod
def get_supported_backends(cls):
return ("serial", "multiprocessing", "dask")
_analysis_algorithm_is_parallelizable = True
[docs]
def __init__(
self,
atomgroup,
zmin=0,
zmax=150,
layer=0.5,
showpoints=False,
nopbc=False,
**kwargs,
):
super().__init__(atomgroup.universe.trajectory, **kwargs)
self._ag = atomgroup
self._zmin = zmin
self._zmax = zmax
self._layer = layer
self._showpoints = showpoints
self._nopbc = nopbc
self.results.area_per_frame = []
self._slices = np.arange(self._zmin, self._zmax, self._layer)
def _prepare(self):
# Called once before frame analysis starts
# Initializes the result container
# Calculates the Z-slice positions from zmin/zmax using layer thickness
return
def _single_frame(self):
# Loops over each z-slice of the protein and selects atoms within each slice using z-coord
for slice_index, slice in enumerate(self._slices[:-1]):
slice = self._ag.select_atoms(
"prop z > "
+ str(self._slices[slice_index])
+ " and prop z < "
+ str(self._slices[slice_index + 1])
)
# Visualizes Voronoi Tess of each slice if --showpoints is enabled
# Currently disabled because it may not work with parallelization.
# if self._showpoints:
# points_p, points_np, points_pbc = voronoi_pbc(slice)
# voronoi_plot(points_p, points_np, points_pbc, name=str(slice_index))
self.results.area_per_frame.append(
calc_area_per_slice(slice, self._nopbc)
)
def _get_aggregator(self):
# in parallel we concatenate all results from each worker in a flat list
# of length N_FRAMES * N_SLICES
return ResultsGroup(
lookup={"area_per_frame": ResultsGroup.flatten_sequence}
)
def _conclude(self):
# area_per_frame is 2D array of shape N_FRAMES * N_SLICES
self.results.area_per_frame = np.array(self.results.area_per_frame)
self.results.area_per_frame = self.results.area_per_frame.reshape(
(self.n_frames, len(self._slices) - 1)
)
self.results.slice_edges = self._slices
self.results.slice_centers = 0.5 * (
self._slices[:-1] + self._slices[1:]
)
def run(self, *args, **kwargs):
if kwargs.get("backend", "serial") != "serial":
kwargs.pop("progressbar", None)
self._progress = None
return super().run(*args, **kwargs)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="Compute protein cross-sectional area per Z-slice from CG trajectory"
)
parser.add_argument("tpr", help="GROMACS topology file (.tpr)")
parser.add_argument("xtc", help="GROMACS trajectory file (.xtc)")
parser.add_argument(
"--output",
type=str,
default="area_per_frame.npy",
help="output file name for area per frame as flat 1D array with length N_FRAMES * N_SLICES",
)
parser.add_argument(
"--output-slice-edges",
type=str,
default="slice_edges.npy",
help="output file name for slice edges as 1D array with length N_SLICES+1",
)
parser.add_argument(
"--output-slice-centers",
type=str,
default="slice_centers.npy",
help="output file name for slice midpoints as 1D array with length N_SLICES",
)
parser.add_argument(
"--output-time",
type=str,
default="times.npy",
help="output file name for frame times as 1D array with length N_FRAMES",
)
parser.add_argument(
"--output-frames",
type=str,
default="frames.npy",
help="output file name for frame indices as 1D array with length N_FRAMES",
)
parser.add_argument("--zmin", type=float, default=0, help="lower z bound")
parser.add_argument("--zmax", type=float, default=150, help="upper z bound")
parser.add_argument(
"--layer", type=float, default=0.5, help="slice thickness"
)
parser.add_argument(
"--nopbc", action="store_true", help="disable PBC images"
)
parser.add_argument(
"--start", type=int, default=None, help="first frame to process"
)
parser.add_argument(
"--stop", type=int, default=None, help="last frame to process"
)
parser.add_argument(
"--backend",
choices=("serial", "multiprocessing", "dask"),
default="serial",
help="parallel backend",
)
parser.add_argument(
"--workers", type=int, help="number of workers for parallel processing"
)
parser.add_argument("--verbose", action="store_true", help="verbose")
# currently commented out in ProteinArea class
# parser.add_argument(
# "--showpoints", action="store_true", help="plot Voronoi for each slice"
# )
args = parser.parse_args()
logging.basicConfig(
filename="protein_area.log",
level=logging.INFO,
format="%(asctime)s %(levelname)s: %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
logging.info("ProteinArea run started! :)")
logging.info(f"Command line arguments: {args}")
overall_start = time.time()
logging.info("Loading MDAnalysis Universe")
u = mda.Universe(args.tpr, args.xtc)
logging.info("MDAnalysis Universe loaded successfully")
ag = u.select_atoms("all")
pa = ProteinArea(
ag, zmin=args.zmin, zmax=args.zmax, layer=args.layer, nopbc=args.nopbc
)
run_start = time.time()
logging.info(
f"pa.run() starting (backend={args.backend}, workers={args.workers})"
)
pa.run(
start=args.start,
stop=args.stop,
backend=args.backend,
n_workers=args.workers,
verbose=args.verbose,
)
run_end = time.time()
run_total = run_end - run_start
logging.info(f"pa.run() completed in {run_total} seconds")
overall_end = time.time()
overall_total = overall_end - overall_start
logging.info(f"Total script duration: {overall_total} seconds")
logging.info("ProteinArea run finished! :)) \n")
np.save(args.output, pa.results.area_per_frame)
logging.info(f"Saved cross-sectional area time series to {args.output}")
np.save(args.output_slice_edges, pa.results.slice_edges)
logging.info(f"Saved slice edges to {args.output_slice_edges}")
np.save(args.output_slice_centers, pa.results.slice_centers)
logging.info(f"Saved slice centers to {args.output_slice_centers}")
np.save(args.output_time, pa.times)
logging.info(f"Saved frame times to {args.output_time}")
np.save(args.output_frames, pa.frames)
logging.info(f"Saved frame indices to {args.output_frames}")