Source code for spark_bestfit.plotting

"""Visualization utilities for fitted distributions."""

import warnings
from typing import TYPE_CHECKING, List, Optional, Tuple

import numpy as np
import scipy.stats as st

# Optional matplotlib import - users can install with: pip install spark-bestfit[plotting]
try:
    import matplotlib.pyplot as plt
    from matplotlib.axes import Axes
    from matplotlib.figure import Figure

    _MATPLOTLIB_AVAILABLE = True
except ImportError:
    _MATPLOTLIB_AVAILABLE = False
    plt = None  # type: ignore[assignment]
    Axes = None  # type: ignore[assignment,misc]
    Figure = None  # type: ignore[assignment,misc]

from spark_bestfit.fitting import compute_pdf_range, extract_distribution_params


def _check_matplotlib() -> None:
    """Fail fast with an actionable message when matplotlib is unavailable.

    Plotting entry points call this before touching ``plt`` so that a missing
    optional dependency surfaces as a clear ImportError.

    Raises:
        ImportError: If the module-level matplotlib import did not succeed.
    """
    if _MATPLOTLIB_AVAILABLE:
        return

    message = (
        "Matplotlib is required for plotting. Install with:\n"
        "  pip install spark-bestfit[plotting]\n\n"
        "Alternatively, use result.pdf(), result.cdf(), result.sample() "
        "to get data for your own plots with any visualization library."
    )
    raise ImportError(message)


def _get_scipy_distribution(dist_name: str):
    """Safely get scipy distribution by name.

    Only univariate distribution objects are accepted, i.e. instances of
    ``scipy.stats.rv_continuous`` or ``scipy.stats.rv_discrete``. Other
    attributes of ``scipy.stats`` (helper functions such as ``describe``,
    base classes, submodules) are rejected instead of being returned and
    failing later with a confusing error when ``.pdf``/``.pmf`` is called.

    Args:
        dist_name: Name of the scipy.stats distribution

    Returns:
        The scipy distribution object (e.g. ``scipy.stats.norm``)

    Raises:
        ValueError: If ``dist_name`` is not found in scipy.stats or does not
            name a univariate distribution
    """
    # A default of None avoids try/except and therefore avoids leaking an
    # unrelated AttributeError as the implicit exception context.
    candidate = getattr(st, dist_name, None)
    if not isinstance(candidate, (st.rv_continuous, st.rv_discrete)):
        raise ValueError(f"Unknown distribution '{dist_name}'. " f"Must be a valid scipy.stats distribution name.")
    return candidate


def _format_distribution_params(
    result: "DistributionFitResult",
    precision: int = 4,
) -> Tuple[str, str]:
    """Build display strings for a fitted distribution and its parameters.

    Parameter names come from ``result.get_param_names()`` and are paired
    positionally with ``result.parameters``; each value is rendered as a
    fixed-point number.

    Args:
        result: Fitted distribution result
        precision: Number of decimal places for parameter values

    Returns:
        Tuple of (distribution_title, param_string)
        e.g., ("norm(loc=50.0000, scale=10.0000)", "loc=50.0000, scale=10.0000")
    """
    rendered = []
    for name, value in zip(result.get_param_names(), result.parameters):
        rendered.append(f"{name}={value:.{precision}f}")

    joined = ", ".join(rendered)
    return f"{result.distribution}({joined})", joined


def _validate_histogram_inputs(
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    func_name: str = "plot",
) -> None:
    """Check that a histogram is present, non-empty and consistently sized.

    Args:
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        func_name: Name of calling function, used as the error message prefix

    Raises:
        ValueError: If either array is None, either array is empty, or the
            two arrays differ in length
    """
    if y_hist is None or x_hist is None:
        raise ValueError(f"{func_name} requires both y_hist and x_hist arrays")

    # ``len`` rather than truthiness: numpy arrays have ambiguous truth values.
    if not (len(y_hist) and len(x_hist)):
        raise ValueError(f"{func_name} requires non-empty histogram arrays")

    n_density = len(y_hist)
    n_centers = len(x_hist)
    if n_density != n_centers:
        raise ValueError(
            f"{func_name} requires y_hist and x_hist to have same length, " f"got {n_density} and {n_centers}"
        )


def _blom_positions(n: int) -> np.ndarray:
    """Return Blom plotting positions ``(i - 0.375) / (n + 0.25)`` for i = 1..n.

    These positions are used as the probability levels for the Q-Q and P-P
    plots in this module.

    Args:
        n: Number of data points

    Returns:
        Array of plotting positions of length n
    """
    ranks = np.arange(1, n + 1)
    return (ranks - 0.375) / (n + 0.25)


def _render_qq_to_ax(
    ax: "Axes",
    result: "DistributionFitResult",
    data: np.ndarray,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    edge_width: float = 0.5,
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    grid_alpha: float = 0.3,
    show_legend: bool = True,
    legend_fontsize: int = 10,
    reference_label: str = "Reference (y=x)",
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw a Q-Q plot of ``data`` against the fitted distribution on ``ax``.

    The sorted sample is scattered against ``result.ppf`` evaluated at Blom
    plotting positions, together with a y=x reference line. Both axes share
    the same limits (the combined data range padded by 5%) and an equal
    aspect ratio.

    Args:
        ax: Matplotlib axis to render to
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        marker: Marker style for data points
        marker_size: Size of scatter markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        edge_width: Width of marker edge
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        grid_alpha: Grid transparency (0-1)
        show_legend: Whether to show legend
        legend_fontsize: Font size for legend
        reference_label: Label for reference line in legend

    Returns:
        Tuple of (theoretical_quantiles, sorted_data) for potential further use
    """
    ordered = np.sort(data)
    quantiles = result.ppf(_blom_positions(len(ordered)))

    point_style = dict(
        s=marker_size,
        alpha=marker_alpha,
        c=marker_color,
        marker=marker,
        edgecolors="white",
        linewidth=edge_width,
        label="Data",
        zorder=2,
    )
    ax.scatter(quantiles, ordered, **point_style)

    # The reference diagonal spans both series, padded by 5% of the range.
    lowest = min(quantiles.min(), ordered.min())
    highest = max(quantiles.max(), ordered.max())
    padding = (highest - lowest) * 0.05
    diagonal = [lowest - padding, highest + padding]

    ax.plot(
        diagonal,
        diagonal,
        color=line_color,
        linestyle=line_style,
        linewidth=line_width,
        label=reference_label,
        zorder=1,
    )

    # Identical limits plus equal aspect keep the y=x line at 45 degrees.
    ax.set_xlim(diagonal)
    ax.set_ylim(diagonal)
    ax.set_aspect("equal", adjustable="box")
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    if show_legend:
        ax.legend(fontsize=legend_fontsize, loc="upper left", framealpha=0.9)

    return quantiles, ordered


def _render_pp_to_ax(
    ax: "Axes",
    result: "DistributionFitResult",
    data: np.ndarray,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    edge_width: float = 0.5,
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    grid_alpha: float = 0.3,
    show_legend: bool = True,
    legend_fontsize: int = 10,
    reference_label: str = "Reference (y=x)",
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw a P-P plot of ``data`` against the fitted distribution on ``ax``.

    ``result.cdf`` evaluated at the sorted sample (x-axis) is scattered
    against Blom plotting positions (y-axis), together with a y=x reference
    line over the unit square.

    Args:
        ax: Matplotlib axis to render to
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        marker: Marker style for data points
        marker_size: Size of scatter markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        edge_width: Width of marker edge
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        grid_alpha: Grid transparency (0-1)
        show_legend: Whether to show legend
        legend_fontsize: Font size for legend
        reference_label: Label for reference line in legend

    Returns:
        Tuple of (theoretical_probs, empirical_probs) for potential further use
    """
    ordered = np.sort(data)
    sample_probs = _blom_positions(len(ordered))
    model_probs = result.cdf(ordered)

    point_style = dict(
        s=marker_size,
        alpha=marker_alpha,
        c=marker_color,
        marker=marker,
        edgecolors="white",
        linewidth=edge_width,
        label="Data",
        zorder=2,
    )
    ax.scatter(model_probs, sample_probs, **point_style)

    ax.plot(
        [0, 1],
        [0, 1],
        color=line_color,
        linestyle=line_style,
        linewidth=line_width,
        label=reference_label,
        zorder=1,
    )

    # Probabilities live on the unit square; equal aspect keeps y=x at 45 degrees.
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_aspect("equal", adjustable="box")
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    if show_legend:
        ax.legend(fontsize=legend_fontsize, loc="upper left", framealpha=0.9)

    return model_probs, sample_probs


if TYPE_CHECKING:
    from spark_bestfit.results import DistributionFitResult


def plot_distribution(
    result: "DistributionFitResult",
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Density",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.5,
    pdf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot fitted distribution against data histogram.

    The fitted PDF is evaluated on 1000 evenly spaced points over the range
    returned by ``compute_pdf_range`` and drawn over an optional bar chart of
    the histogram. The title shows the distribution with its parameters and
    the SSE (plus AIC/BIC when both are available).

    Args:
        result: Fitted distribution result
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency (0-1)
        pdf_linewidth: Line width for PDF curve
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If the histogram arrays are missing, empty or of unequal
            length, or the distribution name is unknown

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot(best, df, 'value', title='Best Fit')
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_distribution")

    # Resolve the scipy distribution and split its fitted parameters.
    distribution = _get_scipy_distribution(result.distribution)
    fitted_params = result.parameters
    shape, loc, scale = extract_distribution_params(fitted_params)

    # Evaluate the PDF on a dense grid over the plotting range.
    lower, upper = compute_pdf_range(distribution, fitted_params, x_hist)
    grid = np.linspace(lower, upper, 1000)
    density = distribution.pdf(grid, *shape, loc=loc, scale=scale)

    fig, ax = plt.subplots(figsize=figsize)
    ax.plot(grid, density, "r-", lw=pdf_linewidth, label="Fitted PDF", zorder=3)

    if show_histogram:
        # Bin width is taken from the first two centers; a lone bin gets width 1.
        if len(x_hist) > 1:
            bin_width = x_hist[1] - x_hist[0]
        else:
            bin_width = 1.0
        ax.bar(
            x_hist,
            y_hist,
            width=bin_width * 0.9,
            alpha=histogram_alpha,
            label="Data Histogram",
            color="skyblue",
            edgecolor="navy",
            linewidth=0.5,
            zorder=2,
        )

    # Title: optional user title, then distribution, then fit metrics.
    dist_title, _ = _format_distribution_params(result)
    metrics_str = f"SSE: {result.sse:.6f}"
    if not (result.aic is None or result.bic is None):
        metrics_str = f"{metrics_str}, AIC: {result.aic:.2f}, BIC: {result.bic:.2f}"
    heading = f"{title}\n" if title else ""
    ax.set_title(f"{heading}{dist_title}\n{metrics_str}", fontsize=title_fontsize, pad=15)

    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=1)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_comparison(
    results: List["DistributionFitResult"],
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "Distribution Comparison",
    xlabel: str = "Value",
    ylabel: str = "Density",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.5,
    pdf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot multiple fitted distributions for comparison.

    Each result's PDF is drawn as its own colored curve (labelled with the
    distribution name and SSE) over an optional bar chart of the histogram.

    Args:
        results: List of DistributionFitResult objects
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency
        pdf_linewidth: PDF line width
        title_fontsize: Title font size
        label_fontsize: Label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency
        save_path: Optional path to save figure
        save_format: Save format

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If the histogram arrays are invalid, ``results`` is
            empty, or a distribution name is unknown

    Example:
        >>> top_3 = results.best(n=3)
        >>> fitter.plot_comparison(top_3, df, 'value')
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_comparison")
    if not results:
        raise ValueError("Must provide at least one result to plot")

    fig, ax = plt.subplots(figsize=figsize)

    if show_histogram:
        # Bin width is taken from the first two centers; a lone bin gets width 1.
        if len(x_hist) > 1:
            bin_width = x_hist[1] - x_hist[0]
        else:
            bin_width = 1.0
        ax.bar(
            x_hist,
            y_hist,
            width=bin_width * 0.9,
            alpha=histogram_alpha,
            label="Data Histogram",
            color="skyblue",
            edgecolor="navy",
            linewidth=0.5,
            zorder=1,
        )

    # One tab10 color per result, sampled evenly across the colormap.
    palette = plt.cm.tab10(np.linspace(0, 1, len(results)))

    for index, (fit, color) in enumerate(zip(results, palette)):
        distribution = _get_scipy_distribution(fit.distribution)
        fitted_params = fit.parameters

        shape, loc, scale = extract_distribution_params(fitted_params)
        lower, upper = compute_pdf_range(distribution, fitted_params, x_hist)
        grid = np.linspace(lower, upper, 1000)
        density = distribution.pdf(grid, *shape, loc=loc, scale=scale)

        # Later results are drawn above earlier ones.
        ax.plot(
            grid,
            density,
            lw=pdf_linewidth,
            label=f"{fit.distribution} (SSE={fit.sse:.4f})",
            color=color,
            zorder=2 + index,
        )

    ax.set_title(title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_qq(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Theoretical Quantiles",
    ylabel: str = "Sample Quantiles",
    figsize: Tuple[int, int] = (10, 10),
    dpi: int = 100,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Create a Q-Q (quantile-quantile) plot for goodness-of-fit assessment.

    A Q-Q plot compares the quantiles of the sample data against the
    theoretical quantiles of the fitted distribution. If the data follows
    the fitted distribution well, the points will fall approximately along
    the reference line.

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        marker: Marker style for data points
        marker_size: Size of markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot_qq(best, df, 'value', title='Q-Q Plot')
    """
    _check_matplotlib()

    # Validate before creating a figure: empty data would otherwise fail deep
    # inside the renderer (min/max of a zero-size array) and leave an orphaned
    # figure behind.
    if data is None or len(data) == 0:
        raise ValueError("plot_qq requires non-empty data array")

    fig, ax = plt.subplots(figsize=figsize)

    # Render Q-Q plot to axis using shared helper
    _render_qq_to_ax(
        ax,
        result,
        data,
        marker=marker,
        marker_size=marker_size,
        marker_alpha=marker_alpha,
        marker_color=marker_color,
        line_color=line_color,
        line_style=line_style,
        line_width=line_width,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=10,
    )

    # Title: optional user title, then distribution, then fit metrics.
    dist_title, _ = _format_distribution_params(result)

    # Prefer the K-S statistic (with p-value when present); fall back to SSE.
    if result.ks_statistic is not None:
        metrics_str = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics_str += f", p={result.pvalue:.4f}"
    else:
        metrics_str = f"SSE={result.sse:.6f}"

    full_title = f"{title}\n{dist_title}\n{metrics_str}" if title else f"{dist_title}\n{metrics_str}"
    ax.set_title(full_title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_pp(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Theoretical Probabilities",
    ylabel: str = "Sample Probabilities",
    figsize: Tuple[int, int] = (10, 10),
    dpi: int = 100,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Create a P-P (probability-probability) plot for goodness-of-fit assessment.

    A P-P plot compares the empirical cumulative distribution function (CDF)
    of the sample data against the theoretical CDF of the fitted
    distribution. It is particularly useful for assessing fit in the center
    of the distribution.

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        marker: Marker style for data points
        marker_size: Size of markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot_pp(best, df, 'value', title='P-P Plot')
    """
    _check_matplotlib()

    # Validate before creating a figure: without this check empty data yields
    # a silently blank plot, and None fails inside np.sort with an unrelated
    # error.
    if data is None or len(data) == 0:
        raise ValueError("plot_pp requires non-empty data array")

    fig, ax = plt.subplots(figsize=figsize)

    # Render P-P plot to axis using shared helper
    _render_pp_to_ax(
        ax,
        result,
        data,
        marker=marker,
        marker_size=marker_size,
        marker_alpha=marker_alpha,
        marker_color=marker_color,
        line_color=line_color,
        line_style=line_style,
        line_width=line_width,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=10,
    )

    # Title: optional user title, then distribution, then fit metrics.
    dist_title, _ = _format_distribution_params(result)

    # Prefer the K-S statistic (with p-value when present); fall back to SSE.
    if result.ks_statistic is not None:
        metrics_str = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics_str += f", p={result.pvalue:.4f}"
    else:
        metrics_str = f"SSE={result.sse:.6f}"

    full_title = f"{title}\n{dist_title}\n{metrics_str}" if title else f"{dist_title}\n{metrics_str}"
    ax.set_title(full_title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_discrete_distribution(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Probability",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.7,
    pmf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot fitted discrete distribution against data histogram.

    The empirical PMF (relative frequency of each observed integer) is drawn
    as bars and the fitted PMF as a stem plot over a slightly extended
    integer range.

    Args:
        result: Fitted discrete distribution result
        data: Integer data (numpy array or any array-like; values are cast
            to int)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency (0-1)
        pmf_linewidth: Line width for PMF markers
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty, or the distribution name
            is unknown
    """
    _check_matplotlib()

    if data is None or len(data) == 0:
        raise ValueError("plot_discrete_distribution requires non-empty data array")

    dist = _get_scipy_distribution(result.distribution)
    params = list(result.parameters)

    # For these distributions the first fitted parameter is rounded to an
    # integer before evaluating the PMF.
    int_param_dists = {"binom", "betabinom", "hypergeom", "nhypergeom", "boltzmann", "zipfian"}
    if result.distribution in int_param_dists:
        params[0] = int(round(params[0]))

    # np.asarray lets plain lists/tuples through; ndarray input is unchanged.
    data_int = np.asarray(data).astype(int)
    unique_vals, counts = np.unique(data_int, return_counts=True)
    empirical_pmf = counts / len(data_int)

    # Extend the range by 2 on each side. The lower bound is clamped at 0,
    # so the fitted PMF is never evaluated at negative integers.
    x_min = max(0, unique_vals.min() - 2)
    x_max = unique_vals.max() + 2
    x_range = np.arange(x_min, x_max + 1)

    theoretical_pmf = dist.pmf(x_range, *params)

    fig, ax = plt.subplots(figsize=figsize)

    if show_histogram:
        ax.bar(
            unique_vals,
            empirical_pmf,
            width=0.8,
            alpha=histogram_alpha,
            label="Empirical PMF",
            color="skyblue",
            edgecolor="navy",
            linewidth=0.5,
            zorder=2,
        )

    # Fitted PMF as stems; the baseline artist is not styled.
    markerline, stemlines, _baseline = ax.stem(
        x_range,
        theoretical_pmf,
        linefmt="r-",
        markerfmt="ro",
        basefmt=" ",
        label="Fitted PMF",
    )
    plt.setp(markerline, markersize=6, zorder=3)
    plt.setp(stemlines, linewidth=pmf_linewidth, zorder=3)

    dist_title, _ = _format_distribution_params(result)

    metrics_parts = []
    if result.sse is not None:
        metrics_parts.append(f"SSE: {result.sse:.6f}")
    if result.ks_statistic is not None:
        metrics_parts.append(f"KS: {result.ks_statistic:.4f}")
    metrics_str = ", ".join(metrics_parts)

    # Join only the non-empty pieces so that a result without metrics does
    # not leave a dangling blank line at the end of the title.
    title_lines = [str(part) for part in (title, dist_title, metrics_str) if part]
    ax.set_title("\n".join(title_lines), fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=1)

    # Integer ticks, thinned so that roughly 20 at most are shown.
    ax.set_xticks(x_range[:: max(1, len(x_range) // 20)])

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_residual_histogram(
    result: "DistributionFitResult",
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "",
    xlabel: str = "Residual (Observed - Expected)",
    ylabel: str = "Frequency",
    figsize: Tuple[int, int] = (10, 8),
    dpi: int = 100,
    bins: int = 30,
    histogram_alpha: float = 0.7,
    histogram_color: str = "steelblue",
    show_zero_line: bool = True,
    zero_line_color: str = "red",
    zero_line_style: str = "--",
    zero_line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot a histogram of residuals (observed - expected density).

    Residuals are the empirical histogram density minus ``result.pdf``
    evaluated at the bin centers. A good fit should show residuals centered
    near zero. The title reports the mean and standard deviation of the
    residuals.

    Args:
        result: Fitted distribution result
        y_hist: Histogram density values (empirical density)
        x_hist: Histogram bin centers
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        bins: Number of bins for the residual histogram
        histogram_alpha: Histogram transparency (0-1)
        histogram_color: Color of histogram bars
        show_zero_line: Whether to show a vertical line at zero
        zero_line_color: Color of the zero reference line
        zero_line_style: Style of the zero reference line
        zero_line_width: Width of the zero reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If the histogram arrays are missing, empty or of unequal
            length

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> y_hist, x_edges = np.histogram(data, bins=50, density=True)
        >>> x_hist = (x_edges[:-1] + x_edges[1:]) / 2
        >>> plot_residual_histogram(best, y_hist, x_hist)
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_residual_histogram")

    # Observed minus expected density at each bin center.
    residuals = y_hist - result.pdf(x_hist)

    fig, ax = plt.subplots(figsize=figsize)

    ax.hist(
        residuals,
        bins=bins,
        alpha=histogram_alpha,
        color=histogram_color,
        edgecolor="white",
        linewidth=0.5,
        label="Residuals",
        zorder=2,
    )

    if show_zero_line:
        ax.axvline(
            x=0,
            color=zero_line_color,
            linestyle=zero_line_style,
            linewidth=zero_line_width,
            label="Zero",
            zorder=3,
        )

    # Title: optional user title, then distribution, then residual summary.
    dist_title, _ = _format_distribution_params(result)
    stats_str = f"Mean={np.mean(residuals):.6f}, Std={np.std(residuals):.6f}"
    heading = f"{title}\n" if title else ""
    ax.set_title(f"{heading}{dist_title}\n{stats_str}", fontsize=title_fontsize, pad=15)

    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    ax.legend(fontsize=10, loc="upper right", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_cdf_comparison(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Cumulative Probability",
    figsize: Tuple[int, int] = (10, 8),
    dpi: int = 100,
    empirical_color: str = "steelblue",
    empirical_linewidth: float = 2.0,
    empirical_alpha: float = 0.8,
    theoretical_color: str = "red",
    theoretical_linewidth: float = 2.0,
    theoretical_linestyle: str = "--",
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot empirical CDF overlaid with theoretical CDF from the fitted distribution.

    The empirical CDF is drawn as a step function that reaches ``i / n`` at
    the i-th smallest observation. The theoretical CDF is ``result.cdf``
    evaluated on 1000 evenly spaced points between the sample minimum and
    maximum. A good fit shows close alignment between the two curves.

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        empirical_color: Color of empirical CDF line
        empirical_linewidth: Line width for empirical CDF
        empirical_alpha: Transparency of empirical CDF line
        theoretical_color: Color of theoretical CDF line
        theoretical_linewidth: Line width for theoretical CDF
        theoretical_linestyle: Line style for theoretical CDF
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> plot_cdf_comparison(best, data, title='CDF Comparison')
    """
    _check_matplotlib()

    if data is None or len(data) == 0:
        raise ValueError("plot_cdf_comparison requires non-empty data array")

    # Empirical CDF: the i-th order statistic maps to i / n.
    ordered = np.sort(data)
    sample_size = len(ordered)
    ecdf = np.arange(1, sample_size + 1) / sample_size

    # Theoretical CDF sampled densely across the observed range.
    grid = np.linspace(ordered.min(), ordered.max(), 1000)
    model_cdf = result.cdf(grid)

    fig, ax = plt.subplots(figsize=figsize)

    ax.step(
        ordered,
        ecdf,
        where="post",
        color=empirical_color,
        linewidth=empirical_linewidth,
        alpha=empirical_alpha,
        label="Empirical CDF",
        zorder=2,
    )
    ax.plot(
        grid,
        model_cdf,
        color=theoretical_color,
        linewidth=theoretical_linewidth,
        linestyle=theoretical_linestyle,
        label="Theoretical CDF",
        zorder=3,
    )

    # Title: optional user title, then distribution, then fit metrics.
    dist_title, _ = _format_distribution_params(result)

    # Prefer the K-S statistic (with p-value when present); fall back to SSE.
    if result.ks_statistic is None:
        metrics_str = f"SSE={result.sse:.6f}"
    else:
        metrics_str = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics_str += f", p={result.pvalue:.4f}"

    heading = f"{title}\n" if title else ""
    ax.set_title(f"{heading}{dist_title}\n{metrics_str}", fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    # Leave a little headroom above probability 1.
    ax.set_ylim([0, 1.05])

    ax.legend(fontsize=legend_fontsize, loc="lower right", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax
def plot_diagnostics(
    result: "DistributionFitResult",
    data: np.ndarray,
    y_hist: Optional[np.ndarray] = None,
    x_hist: Optional[np.ndarray] = None,
    bins: int = 50,
    title: str = "",
    figsize: Tuple[int, int] = (14, 12),
    dpi: int = 100,
    title_fontsize: int = 16,
    subplot_title_fontsize: int = 12,
    label_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, np.ndarray]:
    """Draw a 2x2 panel of diagnostics for judging how well a distribution fits.

    Panel layout:
        - top-left: Q-Q plot (sample quantiles against theoretical quantiles)
        - top-right: P-P plot (empirical against theoretical probabilities)
        - bottom-left: histogram of the density residuals
          (``y_hist - result.pdf(x_hist)``)
        - bottom-right: empirical CDF step curve overlaid with the fitted CDF

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        y_hist: Optional pre-computed histogram density values
        x_hist: Optional pre-computed histogram bin centers. If either
            ``y_hist`` or ``x_hist`` is None, both are recomputed from
            ``data`` with ``bins`` bins.
        bins: Number of histogram bins (used only when the histogram is
            computed here)
        title: Overall figure title; when empty only the distribution
            description is shown
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        title_fontsize: Main title font size
        subplot_title_fontsize: Subplot title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure; when set, a warning
            announcing the path is emitted after saving
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, array of axes)

    Raises:
        ValueError: If ``data`` is None or empty

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fig, axes = plot_diagnostics(best, data, title='Fit Diagnostics')
    """
    _check_matplotlib()

    if data is None or len(data) == 0:
        raise ValueError("plot_diagnostics requires non-empty data array")

    # The two histogram arrays are only usable as a pair, so a missing one
    # triggers recomputation of both from the raw data.
    if y_hist is None or x_hist is None:
        y_hist, bin_edges = np.histogram(data, bins=bins, density=True)
        x_hist = (bin_edges[:-1] + bin_edges[1:]) / 2

    fig, axes = plt.subplots(2, 2, figsize=figsize)
    (ax_qq, ax_pp), (ax_resid, ax_cdf) = axes

    # Short (2-decimal) parameter summary used in the figure title
    dist_info, _ = _format_distribution_params(result, precision=2)

    def _label_panel(ax, heading, x_text, y_text):
        """Apply the subplot title and both axis labels to one panel."""
        ax.set_title(heading, fontsize=subplot_title_fontsize)
        ax.set_xlabel(x_text, fontsize=label_fontsize)
        ax.set_ylabel(y_text, fontsize=label_fontsize)

    # Settings shared by the Q-Q and P-P renderers
    scatter_style = dict(
        marker_size=20,
        marker_alpha=0.5,
        edge_width=0.3,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=8,
        reference_label="y=x",
    )
    # Grid appearance shared by the residual and CDF panels
    grid_style = dict(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    # --- Top-left: Q-Q plot ---
    _render_qq_to_ax(ax_qq, result, data, **scatter_style)
    _label_panel(ax_qq, "Q-Q Plot", "Theoretical Quantiles", "Sample Quantiles")

    # --- Top-right: P-P plot ---
    _render_pp_to_ax(ax_pp, result, data, **scatter_style)
    _label_panel(ax_pp, "P-P Plot", "Theoretical Probabilities", "Sample Probabilities")

    # Ordered sample, needed for the empirical CDF further down
    sorted_data = np.sort(data)
    n = len(sorted_data)

    # --- Bottom-left: residuals between observed and fitted density ---
    residuals = y_hist - result.pdf(x_hist)
    mean_resid = np.mean(residuals)
    std_resid = np.std(residuals)

    ax_resid.hist(
        residuals,
        bins=30,
        alpha=0.7,
        color="steelblue",
        edgecolor="white",
        linewidth=0.5,
        zorder=2,
    )
    ax_resid.axvline(x=0, color="red", linestyle="--", linewidth=1.5, label="Zero", zorder=3)
    _label_panel(
        ax_resid,
        f"Residual Histogram\nMean={mean_resid:.4f}, Std={std_resid:.4f}",
        "Residual (Observed - Expected)",
        "Frequency",
    )
    ax_resid.legend(fontsize=8, loc="upper right")
    ax_resid.grid(**grid_style)

    # --- Bottom-right: empirical vs. theoretical CDF ---
    empirical_cdf = np.arange(1, n + 1) / n
    x_range = np.linspace(sorted_data.min(), sorted_data.max(), 1000)
    theoretical_cdf = result.cdf(x_range)

    ax_cdf.step(
        sorted_data,
        empirical_cdf,
        where="post",
        color="steelblue",
        linewidth=1.5,
        alpha=0.8,
        label="Empirical CDF",
        zorder=2,
    )
    ax_cdf.plot(
        x_range,
        theoretical_cdf,
        color="red",
        linewidth=1.5,
        linestyle="--",
        label="Theoretical CDF",
        zorder=3,
    )
    ax_cdf.set_ylim([0, 1.05])
    _label_panel(ax_cdf, "CDF Comparison", "Value", "Cumulative Probability")
    ax_cdf.legend(fontsize=8, loc="lower right")
    ax_cdf.grid(**grid_style)

    # Figure-level title: the caller's title (if any) above the distribution summary
    heading = f"{title}\n{dist_info}" if title else dist_info
    fig.suptitle(heading, fontsize=title_fontsize, y=1.02)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        # stacklevel=2 attributes the notice to the caller of plot_diagnostics
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, axes