Source code for g4hunterpy3.plotting


import numpy as np
import matplotlib.pyplot as plt  # local import so CLI works without mpl
from pathlib import Path
import matplotlib

# so we can edit text if PDFs generated
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

font = {'family' : 'arial',
    	'weight' : 'normal'}

matplotlib.rc('font', **font)

# ................................................................................................
#
[docs] def simple_plot(scores: np.ndarray, out_pdf: Path, dpi: int = 300, line_color = "red", line_width = 0.8) -> None: """ Save a PDF plot of the sliding-window scores. Parameters ---------- scores : np.ndarray Array of sliding-window scores. out_pdf : Path Output PDF file path. dpi : int, optional Dots per inch for the output PDF, by default 300. Returns ------- None No return value but writes to file. """ # define x values x = np.arange(scores.size) # build figure fig = plt.figure() plt.plot(x, scores, line_color, linewidth=line_width) plt.xlim(0, max(1, scores.size)) plt.xlabel("Window start (0-based)") plt.ylabel("G4Hunter score (mean)") plt.grid(True) fig.savefig(str(out_pdf), dpi=dpi) plt.close(fig)
# ................................................................................................ # # ................................................................................................ #
[docs] def complex_plot(hits: list, genome_length: int, out_pdf: Path, nbins: int = 1000, score: float = 1.2, percentile_to_use: int = 95, dpi: int = 300, figsize: tuple = (8, 1.5), colorbar_vmax: float = 3.0, colorbar_vmin: float = None, highlight_regions: list = None, strand_agnostic: bool = True ): """ Save a PDF plot of the sliding-window scores. Parameters ---------- hits : list List of hit objects with start, end, and score attributes. Each hit represents a window with a calculated G4Hunter score. genome_length : int Length of the genome/sequence being analyzed; this is needed to map the hits to the full length sequence. out_pdf : Path Output PDF file path. nbins : int, optional Number of bins for the complex plot, by default 1000. score : float, optional Score threshold used for calling hits, used to set floor on colorbar. By default 1.2. percentile_to_use : int, optional Percentile of scores to use within each bin (e.g., 95 for 95th percentile), by default 95. dpi : int, optional Dots per inch for the output PDF, by default 300. figsize : tuple, optional Figure size as (width, height) in inches, by default (8, 2.0). colorbar_vmax = float, optional Maximum value for colorbar, by default 3.0. colorbar_vmin = float, optional Mininum value for colorbar, by default max(score, 0.0). highlight_regions : list, optional List of [start, end] pairs defining regions to highlight on the x-axis. Each element should be a list or tuple with two integers representing the start and end positions (1-based) of the region to highlight. Highlighted regions are shown as yellow vertical spans with alpha=0.5. By default None (no highlighting). strand_agnostic : bool, optional If True, use absolute G4 scores (ignoring strand) for plotting. If False, use raw scores (which can be negative for C-rich regions). Set to true for dsDNA sequences where both strands can form G4s. By default True. Returns ------- None No return value but writes to file. """ ## sanity check stuff if nbins < 1: raise ValueError("nbins must be >= 1") if percentile_to_use < 0 or percentile_to_use > 100: raise ValueError("percentile_to_use must be between 0 and 100") if genome_length < 1: raise ValueError("genome_length must be >= 1") if dpi < 100: raise ValueError("dpi must be >= 100") # Set colorbar limits based on strand_agnostic mode if colorbar_vmin is None: if strand_agnostic: colorbar_vmin = max(score, 0.0) else: # For strand-specific, use symmetric limits around zero colorbar_vmin = -colorbar_vmax # build up scores and positions arrays from hits scores = [] positions = [] for h in hits: scores.append(h.score) positions.append(h.start) # 1-based positions scores = np.array(scores) positions = np.array(positions) ## Build full-length array with NaNs for missing - initialize with NaNs ## and then overwrite with the real values so we leave "missing" positions as NaN full = np.full(genome_length, np.nan) full[positions - 1] = scores # positions are 1-based, so correct # get strand-agnostic G4 propensity if strand_agnostic: signal = np.abs(full) else: signal = full # build "edges" edges = np.linspace(0, genome_length, nbins + 1, dtype=int) # this should be true but we check to be sure assert len(signal) == genome_length # initialize binned arrays binned = np.full(nbins, np.nan) coverage = np.zeros(nbins) for i in range(nbins): # get a local segment of the signal (absolute G4 score) seg = signal[edges[i]:edges[i+1]] # returns a boolean array where True indicates valid (non-NaN) # data valid = np.isfinite(seg) # fraction of positions with data (e.g. if 1 = all positions have data, 0 = none) coverage[i] = valid.mean() # if we had ANY valid data in this segment... if valid.any(): # get xxx-percentile of valid data in this segment. We take 95th percentile # to capture the strongest G4 signal in this bin, while ignoring outliers. This # serves to highlight regions with strong G4 propensity while avoiding overemphasis # on local overlapping regions (e.g. if you had a run of G4s overlapping each other, # you don't want that to dominate the entire bin). binned[i] = np.nanpercentile(seg, percentile_to_use) # Make it a 1-row "heatmap" heat = np.ma.masked_invalid(binned.reshape(1, -1)) fig = plt.figure(figsize=figsize, dpi=dpi) gs = fig.add_gridspec(3, 1, height_ratios=[0.25, 1, 1], hspace=0.3) # Top subplot for colorbar cbar_ax = fig.add_subplot(gs[0, 0]) cbar_ax.axis('off') # Hide the axes # Middle subplot for heatmap ax = fig.add_subplot(gs[1, 0]) # Use appropriate colormap based on strand_agnostic setting if strand_agnostic: cmap = plt.cm.Reds.copy() else: # Use diverging colormap for strand-specific: blue (C-rich/negative) to red (G-rich/positive) cmap = plt.cm.RdBu_r.copy() cmap.set_bad(color='lightgray') # missing bins im = ax.imshow( heat, aspect='auto', interpolation='nearest', extent=[1, genome_length, 0, 1], cmap=cmap, vmin=colorbar_vmin, vmax=colorbar_vmax) ax.set_yticks([]) ax.set_xlim(1, genome_length) ax.tick_params(axis='x', which='both', bottom=False, labelbottom=False) ax.set_ylabel("G4 propensity", rotation=0, labelpad=35, va='center') # Add highlight regions if provided if highlight_regions is not None: for region in highlight_regions: start, end = region[0], region[1] ax.axvspan(start, end, lw=0, color='y', alpha=0.5) # Create colorbar in the top subplot area cb = plt.colorbar(im, ax=cbar_ax, orientation='horizontal', fraction=0.8, pad=0.1) # Set only min/max ticks from actual data valid_data = binned[np.isfinite(binned)] if len(valid_data) > 0: vmin, vmax = colorbar_vmin, colorbar_vmax cb.set_ticks([]) # Remove automatic ticks # Manually position min/max labels at the edges cb.ax.text(-0.12, 0.9, f'{vmin:.2f}', ha='left', va='top', fontsize=6, transform=cb.ax.transAxes) cb.ax.text(1.12, 0.9, f'{vmax:.2f}', ha='right', va='top', fontsize=6, transform=cb.ax.transAxes) # Bottom subplot for coverage ax2 = fig.add_subplot(gs[2, 0], sharex=ax) coverage_positions = np.linspace(1, genome_length, nbins) bar_width = coverage_positions[1]-coverage_positions[0] ax2.bar(coverage_positions, coverage, color='k',width=bar_width) if highlight_regions is not None: for region in highlight_regions: start, end = region[0], region[1] ax2.axvspan(start, end, lw=0, color='y', alpha=0.5) ax2.set_ylim(0, 1) ax2.set_ylabel("Coverage", rotation=0, labelpad=20, va='center') ax2.set_xticks(np.arange(0, genome_length+1, 20000)) ax2.set_xlabel("Position (bp)") # Add common y-axis label fig.suptitle(f"G4 propensity (bin_size={edges[1]-edges[0]} bps, window percentile={percentile_to_use}%)", y=0.98, fontsize=8) plt.tight_layout(pad=1.0) fig.savefig(str(out_pdf), dpi=dpi, bbox_inches='tight')
# ................................................................................................ #