# Source code for spark_bestfit.plotting

"""Visualization utilities for fitted distributions."""

import warnings
from typing import TYPE_CHECKING, List, Optional, Tuple

import numpy as np
import scipy.stats as st

# Optional matplotlib import - users can install with: pip install spark-bestfit[plotting]
try:
    import matplotlib.pyplot as plt
    from matplotlib.axes import Axes
    from matplotlib.figure import Figure

    _MATPLOTLIB_AVAILABLE = True
except ImportError:
    _MATPLOTLIB_AVAILABLE = False
    plt = None  # type: ignore[assignment]
    Axes = None  # type: ignore[assignment,misc]
    Figure = None  # type: ignore[assignment,misc]

from spark_bestfit.fitting import compute_pdf_range, extract_distribution_params


def _check_matplotlib() -> None:
    """Ensure the optional matplotlib dependency was imported successfully.

    Raises:
        ImportError: If matplotlib could not be imported at module load
            time. The message shows the install command and points at the
            result methods that supply data for other plotting libraries.
    """
    if _MATPLOTLIB_AVAILABLE:
        return

    message = (
        "Matplotlib is required for plotting. Install with:\n"
        "  pip install spark-bestfit[plotting]\n\n"
        "Alternatively, use result.pdf(), result.cdf(), result.sample() "
        "to get data for your own plots with any visualization library."
    )
    raise ImportError(message)


def _get_scipy_distribution(dist_name: str):
    """Safely get scipy distribution by name.

    Args:
        dist_name: Name of the scipy.stats distribution

    Returns:
        The scipy.stats distribution object (e.g. ``scipy.stats.norm``)

    Raises:
        ValueError: If ``dist_name`` does not name a univariate continuous
            or discrete distribution in scipy.stats
    """
    # scipy.stats also exposes plain functions and classes (``describe``,
    # ``ttest_ind``, ``rv_continuous`` ...). Looking the name up with a None
    # default and checking the type rejects those here, instead of letting
    # them fail later when ``.pdf``/``.pmf`` is called on a non-distribution.
    candidate = getattr(st, dist_name, None)
    if not isinstance(candidate, (st.rv_continuous, st.rv_discrete)):
        raise ValueError(f"Unknown distribution '{dist_name}'. Must be a valid scipy.stats distribution name.")
    return candidate


def _format_distribution_params(
    result: "DistributionFitResult",
    precision: int = 4,
) -> Tuple[str, str]:
    """Build display strings for a fitted distribution and its parameters.

    Names from ``result.get_param_names()`` are paired positionally with
    ``result.parameters`` and each value is rendered in fixed-point notation.

    Args:
        result: Fitted distribution result
        precision: Number of decimal places for parameter values

    Returns:
        Tuple of (distribution_title, param_string)
        e.g., ("norm(loc=50.0000, scale=10.0000)", "loc=50.0000, scale=10.0000")
    """
    formatted_pairs = []
    for name, value in zip(result.get_param_names(), result.parameters):
        formatted_pairs.append(f"{name}={value:.{precision}f}")

    joined = ", ".join(formatted_pairs)
    return f"{result.distribution}({joined})", joined


def _validate_histogram_inputs(
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    func_name: str = "plot",
) -> None:
    """Check that the two histogram arrays are present, non-empty and aligned.

    Args:
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        func_name: Name of calling function, used as the prefix of error messages

    Raises:
        ValueError: If either array is None, either array is empty, or the
            two arrays differ in length
    """
    if any(arr is None for arr in (y_hist, x_hist)):
        raise ValueError(f"{func_name} requires both y_hist and x_hist arrays")

    # ``and`` short-circuits, so x_hist is only measured when y_hist is non-empty.
    if not (len(y_hist) and len(x_hist)):
        raise ValueError(f"{func_name} requires non-empty histogram arrays")

    n_density, n_centers = len(y_hist), len(x_hist)
    if n_density != n_centers:
        raise ValueError(
            f"{func_name} requires y_hist and x_hist to have same length, got {n_density} and {n_centers}"
        )


def _blom_positions(n: int) -> np.ndarray:
    """Return Blom plotting positions for a sample of size ``n``.

    The i-th position (i = 1..n) is ``(i - 0.375) / (n + 0.25)``. These
    serve as the probability levels for the Q-Q and P-P plots in this module.

    Args:
        n: Number of data points

    Returns:
        Array of plotting positions of length n
    """
    ranks = np.arange(1, n + 1)
    denominator = n + 0.25
    return (ranks - 0.375) / denominator


def _render_qq_to_ax(
    ax: "Axes",
    result: "DistributionFitResult",
    data: np.ndarray,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    edge_width: float = 0.5,
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    grid_alpha: float = 0.3,
    show_legend: bool = True,
    legend_fontsize: int = 10,
    reference_label: str = "Reference (y=x)",
) -> Tuple[np.ndarray, np.ndarray]:
    """Render Q-Q plot to an existing axis.

    The sorted sample is plotted against ``result.ppf`` evaluated at Blom
    plotting positions, together with a y=x reference line. Both axes share
    the same limits and an equal aspect ratio.

    Args:
        ax: Matplotlib axis to render to
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        marker: Marker style for data points
        marker_size: Size of scatter markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        edge_width: Width of marker edge
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        grid_alpha: Grid transparency (0-1)
        show_legend: Whether to show legend
        legend_fontsize: Font size for legend
        reference_label: Label for reference line in legend

    Returns:
        Tuple of (theoretical_quantiles, sorted_data) for potential further use

    Raises:
        ValueError: If ``data`` is empty
    """
    sorted_data = np.sort(data)
    n = len(sorted_data)
    # Fail before touching the axis: an empty sample would otherwise draw an
    # empty scatter and then die in ``.min()`` with a "zero-size array" error.
    if n == 0:
        raise ValueError("Q-Q plot requires a non-empty data array")

    positions = _blom_positions(n)
    theoretical_quantiles = result.ppf(positions)

    ax.scatter(
        theoretical_quantiles,
        sorted_data,
        s=marker_size,
        alpha=marker_alpha,
        c=marker_color,
        marker=marker,
        edgecolors="white",
        linewidth=edge_width,
        label="Data",
        zorder=2,
    )

    min_val = min(theoretical_quantiles.min(), sorted_data.min())
    max_val = max(theoretical_quantiles.max(), sorted_data.max())
    span = max_val - min_val
    if span > 0:
        margin = span * 0.05
    else:
        # All points coincide (constant data with a degenerate fit): a zero
        # margin would give identical lower/upper axis limits, so fall back
        # to a margin scaled to the magnitude of the value itself.
        margin = max(abs(min_val), 1.0) * 0.05
    line_range = [min_val - margin, max_val + margin]

    ax.plot(
        line_range,
        line_range,
        color=line_color,
        linestyle=line_style,
        linewidth=line_width,
        label=reference_label,
        zorder=1,
    )

    # Identical limits plus equal aspect keep the y=x line on the diagonal.
    ax.set_xlim(line_range)
    ax.set_ylim(line_range)
    ax.set_aspect("equal", adjustable="box")
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    if show_legend:
        ax.legend(fontsize=legend_fontsize, loc="upper left", framealpha=0.9)

    return theoretical_quantiles, sorted_data


def _render_pp_to_ax(
    ax: "Axes",
    result: "DistributionFitResult",
    data: np.ndarray,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    edge_width: float = 0.5,
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    grid_alpha: float = 0.3,
    show_legend: bool = True,
    legend_fontsize: int = 10,
    reference_label: str = "Reference (y=x)",
) -> Tuple[np.ndarray, np.ndarray]:
    """Draw a P-P plot onto an axis that already exists.

    ``result.cdf`` of the sorted sample goes on the x-axis and Blom plotting
    positions on the y-axis, with a y=x reference line over the unit square.

    Args:
        ax: Matplotlib axis to render to
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        marker: Marker style for data points
        marker_size: Size of scatter markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        edge_width: Width of marker edge
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        grid_alpha: Grid transparency (0-1)
        show_legend: Whether to show legend
        legend_fontsize: Font size for legend
        reference_label: Label for reference line in legend

    Returns:
        Tuple of (theoretical_probs, empirical_probs) for potential further use
    """
    ordered = np.sort(data)
    empirical_probs = _blom_positions(len(ordered))
    theoretical_probs = result.cdf(ordered)

    point_style = {
        "s": marker_size,
        "alpha": marker_alpha,
        "c": marker_color,
        "marker": marker,
        "edgecolors": "white",
        "linewidth": edge_width,
        "label": "Data",
        "zorder": 2,
    }
    ax.scatter(theoretical_probs, empirical_probs, **point_style)

    reference_style = {
        "color": line_color,
        "linestyle": line_style,
        "linewidth": line_width,
        "label": reference_label,
        "zorder": 1,
    }
    ax.plot([0, 1], [0, 1], **reference_style)

    # Probabilities live in [0, 1] on both axes; keep the square undistorted.
    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1])
    ax.set_aspect("equal", adjustable="box")
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    if not show_legend:
        return theoretical_probs, empirical_probs

    ax.legend(fontsize=legend_fontsize, loc="upper left", framealpha=0.9)
    return theoretical_probs, empirical_probs


if TYPE_CHECKING:
    from spark_bestfit.results import DistributionFitResult



# [docs]
def plot_distribution(
    result: "DistributionFitResult",
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Density",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.5,
    pdf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Draw the fitted PDF over a bar rendering of the data histogram.

    The plot title always carries the distribution name with its parameters
    and the SSE; AIC and BIC are appended when both are set on ``result``.

    Args:
        result: Fitted distribution result
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        title: Plot title (placed above the distribution/metrics lines)
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency (0-1)
        pdf_linewidth: Line width for PDF curve
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If the histogram arrays are invalid or the distribution
            name is unknown

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot(best, df, 'value', title='Best Fit')
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_distribution")

    # Evaluate the fitted PDF on a 1000-point grid over the range reported
    # by compute_pdf_range for this distribution and histogram.
    scipy_dist = _get_scipy_distribution(result.distribution)
    fitted_params = result.parameters
    shape_args, loc, scale = extract_distribution_params(fitted_params)
    lower, upper = compute_pdf_range(scipy_dist, fitted_params, x_hist)
    grid = np.linspace(lower, upper, 1000)
    density = scipy_dist.pdf(grid, *shape_args, loc=loc, scale=scale)

    fig, ax = plt.subplots(figsize=figsize)

    # Fitted curve sits above the bars (higher zorder).
    ax.plot(grid, density, "r-", lw=pdf_linewidth, label="Fitted PDF", zorder=3)

    if show_histogram:
        # Bar width is taken from the spacing of the first two bin centers;
        # a single-bin histogram falls back to unit width.
        if len(x_hist) > 1:
            bin_width = x_hist[1] - x_hist[0]
        else:
            bin_width = 1.0
        bar_style = {
            "width": bin_width * 0.9,
            "alpha": histogram_alpha,
            "label": "Data Histogram",
            "color": "skyblue",
            "edgecolor": "navy",
            "linewidth": 0.5,
            "zorder": 2,
        }
        ax.bar(x_hist, y_hist, **bar_style)

    # Title: optional user title, then distribution summary, then metrics.
    dist_title, _ = _format_distribution_params(result)
    metrics_str = f"SSE: {result.sse:.6f}"
    if not (result.aic is None or result.bic is None):
        metrics_str = f"{metrics_str}, AIC: {result.aic:.2f}, BIC: {result.bic:.2f}"

    title_lines = [dist_title, metrics_str]
    if title:
        title_lines.insert(0, title)

    ax.set_title("\n".join(title_lines), fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=1)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        # The save confirmation is surfaced through the warnings machinery.
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_comparison(
    results: List["DistributionFitResult"],
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "Distribution Comparison",
    xlabel: str = "Value",
    ylabel: str = "Density",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.5,
    pdf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot multiple fitted distributions for comparison.

    Each result is drawn as a PDF curve labelled with its distribution name
    and SSE, on top of the (optional) data histogram. Curve colors are
    sampled evenly from matplotlib's ``tab10`` colormap.

    Args:
        results: List of DistributionFitResult objects
        y_hist: Histogram density values
        x_hist: Histogram bin centers
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency (0-1)
        pdf_linewidth: PDF line width
        title_fontsize: Title font size
        label_fontsize: Label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If the histogram arrays are invalid, ``results`` is
            empty, or a result names an unknown distribution

    Example:
        >>> top_3 = results.best(n=3)
        >>> fitter.plot_comparison(top_3, df, 'value')
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_comparison")

    if not results:
        raise ValueError("Must provide at least one result to plot")

    # Evaluate every curve BEFORE creating the figure: if any result has an
    # unknown distribution or its PDF evaluation fails, the error propagates
    # without leaving an orphaned open figure in pyplot's registry.
    curves = []
    for result in results:
        dist = _get_scipy_distribution(result.distribution)
        params = result.parameters

        shape, loc, scale = extract_distribution_params(params)
        start, end = compute_pdf_range(dist, params, x_hist)

        x_pdf = np.linspace(start, end, 1000)
        y_pdf = dist.pdf(x_pdf, *shape, loc=loc, scale=scale)
        label = f"{result.distribution} (SSE={result.sse:.4f})"
        curves.append((x_pdf, y_pdf, label))

    # One color per result, spread evenly across the colormap.
    colors = plt.cm.tab10(np.linspace(0, 1, len(results)))

    fig, ax = plt.subplots(figsize=figsize)

    if show_histogram:
        # Bar width comes from the spacing of the first two bin centers;
        # a single-bin histogram falls back to unit width.
        bin_width = x_hist[1] - x_hist[0] if len(x_hist) > 1 else 1.0
        ax.bar(
            x_hist,
            y_hist,
            width=bin_width * 0.9,
            alpha=histogram_alpha,
            label="Data Histogram",
            color="skyblue",
            edgecolor="navy",
            linewidth=0.5,
            zorder=1,
        )

    # Later results get a higher zorder so they are drawn on top.
    for i, ((x_pdf, y_pdf, label), color) in enumerate(zip(curves, colors)):
        ax.plot(
            x_pdf,
            y_pdf,
            lw=pdf_linewidth,
            label=label,
            color=color,
            zorder=2 + i,
        )

    ax.set_title(title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)
    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_qq(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Theoretical Quantiles",
    ylabel: str = "Sample Quantiles",
    figsize: Tuple[int, int] = (10, 10),
    dpi: int = 100,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Create a Q-Q (quantile-quantile) plot for goodness-of-fit assessment.

    A Q-Q plot compares the quantiles of the sample data against the theoretical
    quantiles of the fitted distribution. If the data follows the fitted
    distribution well, the points will fall approximately along the reference line.

    The title shows the distribution with its parameters followed by the K-S
    statistic (and p-value when set), or the SSE when no K-S statistic is set.

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        marker: Marker style for data points
        marker_size: Size of markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot_qq(best, df, 'value', title='Q-Q Plot')
    """
    _check_matplotlib()

    # Validate before creating the figure: with no data the renderer fails
    # while computing axis limits, which would leave an orphaned open figure
    # behind and surface a cryptic numpy reduction error to the caller.
    if data is None or len(data) == 0:
        raise ValueError("plot_qq requires non-empty data array")

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)

    # Render Q-Q plot to axis using shared helper
    _render_qq_to_ax(
        ax,
        result,
        data,
        marker=marker,
        marker_size=marker_size,
        marker_alpha=marker_alpha,
        marker_color=marker_color,
        line_color=line_color,
        line_style=line_style,
        line_width=line_width,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=10,
    )

    # Format title with distribution info
    dist_title, _ = _format_distribution_params(result)

    # Prefer the K-S statistic (plus p-value when set); otherwise show SSE
    if result.ks_statistic is not None:
        metrics_str = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics_str += f", p={result.pvalue:.4f}"
    else:
        metrics_str = f"SSE={result.sse:.6f}"

    full_title = f"{title}\n{dist_title}\n{metrics_str}" if title else f"{dist_title}\n{metrics_str}"

    ax.set_title(full_title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_pp(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Theoretical Probabilities",
    ylabel: str = "Sample Probabilities",
    figsize: Tuple[int, int] = (10, 10),
    dpi: int = 100,
    marker: str = "o",
    marker_size: int = 30,
    marker_alpha: float = 0.6,
    marker_color: str = "steelblue",
    line_color: str = "red",
    line_style: str = "--",
    line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Create a P-P (probability-probability) plot for goodness-of-fit assessment.

    The plot sets the empirical cumulative probabilities of the sample
    against the CDF of the fitted distribution evaluated at the same points.
    Points close to the diagonal reference line indicate agreement between
    the two; the P-P view is most informative around the center of the
    distribution.

    The title shows the distribution with its parameters followed by the K-S
    statistic (and p-value when set), or the SSE when no K-S statistic is set.

    Args:
        result: Fitted distribution result
        data: Sample data array (1D numpy array)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        marker: Marker style for data points
        marker_size: Size of markers
        marker_alpha: Marker transparency (0-1)
        marker_color: Color of markers
        line_color: Color of reference line
        line_style: Style of reference line
        line_width: Width of reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fitter.plot_pp(best, df, 'value', title='P-P Plot')
    """
    _check_matplotlib()

    fig, ax = plt.subplots(figsize=figsize)

    # Delegate the actual drawing to the shared axis-level renderer.
    render_options = {
        "marker": marker,
        "marker_size": marker_size,
        "marker_alpha": marker_alpha,
        "marker_color": marker_color,
        "line_color": line_color,
        "line_style": line_style,
        "line_width": line_width,
        "grid_alpha": grid_alpha,
        "show_legend": True,
        "legend_fontsize": 10,
    }
    _render_pp_to_ax(ax, result, data, **render_options)

    dist_title, _ = _format_distribution_params(result)

    # Goodness-of-fit line: SSE only when no K-S statistic is available.
    if result.ks_statistic is None:
        metrics = f"SSE={result.sse:.6f}"
    else:
        metrics = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics = f"{metrics}, p={result.pvalue:.4f}"

    title_lines = [dist_title, metrics]
    if title:
        title_lines.insert(0, title)

    ax.set_title("\n".join(title_lines), fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_discrete_distribution(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Probability",
    figsize: Tuple[int, int] = (12, 8),
    dpi: int = 100,
    show_histogram: bool = True,
    histogram_alpha: float = 0.7,
    pmf_linewidth: int = 2,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot fitted discrete distribution against data histogram.

    The empirical PMF (relative frequency of each integer value) is drawn as
    bars and the fitted PMF as stems, over a range extending two integers
    beyond the observed minimum and maximum.

    Args:
        result: Fitted discrete distribution result
        data: Integer data array (any array-like; values are cast to int)
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        show_histogram: Show data histogram
        histogram_alpha: Histogram transparency (0-1)
        pmf_linewidth: Line width for PMF markers
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed
        ValueError: If ``data`` is None or empty, or the distribution name
            is unknown
    """
    _check_matplotlib()

    # Validate data array
    if data is None or len(data) == 0:
        raise ValueError("plot_discrete_distribution requires non-empty data array")

    # Get scipy distribution using safe helper
    dist = _get_scipy_distribution(result.distribution)
    params = list(result.parameters)

    # Positions of the integer-valued shape parameters, in scipy's argument
    # order. Only these are rounded: for boltzmann (lambda_, N) and zipfian
    # (a, n) the first shape parameter is real-valued, and rounding it would
    # plot a different distribution than the one that was fitted.
    int_param_positions = {
        "binom": (0,),  # n
        "betabinom": (0,),  # n
        "hypergeom": (0, 1, 2),  # M, n, N
        "nhypergeom": (0, 1, 2),  # M, n, r
        "boltzmann": (1,),  # N
        "zipfian": (1,),  # n
    }
    for position in int_param_positions.get(result.distribution, ()):
        if position < len(params):
            params[position] = int(round(params[position]))

    # Compute empirical PMF from data (asarray lets plain sequences through)
    data_int = np.asarray(data).astype(int)
    unique_vals, counts = np.unique(data_int, return_counts=True)
    empirical_pmf = counts / len(data_int)

    # Extend range slightly for theoretical PMF. Non-negative data keeps the
    # lower bound at zero or above; data with negative values is not clamped,
    # so the fitted PMF is drawn over the same support as the empirical bars.
    lowest = unique_vals.min()
    x_min = max(0, lowest - 2) if lowest >= 0 else lowest - 2
    x_max = unique_vals.max() + 2
    x_range = np.arange(x_min, x_max + 1)

    # Compute theoretical PMF
    theoretical_pmf = dist.pmf(x_range, *params)

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)

    # Plot empirical histogram as bars
    if show_histogram:
        ax.bar(
            unique_vals,
            empirical_pmf,
            width=0.8,
            alpha=histogram_alpha,
            label="Empirical PMF",
            color="skyblue",
            edgecolor="navy",
            linewidth=0.5,
            zorder=2,
        )

    # Plot theoretical PMF as stems/lollipops (baseline is hidden via basefmt)
    markerline, stemlines, _baseline = ax.stem(
        x_range,
        theoretical_pmf,
        linefmt="r-",
        markerfmt="ro",
        basefmt=" ",
        label="Fitted PMF",
    )
    plt.setp(markerline, markersize=6, zorder=3)
    plt.setp(stemlines, linewidth=pmf_linewidth, zorder=3)

    # Format parameter string using helper
    dist_title, _ = _format_distribution_params(result)

    # Build metrics string from whichever metrics are set
    metrics_parts = []
    if result.sse is not None:
        metrics_parts.append(f"SSE: {result.sse:.6f}")
    if result.ks_statistic is not None:
        metrics_parts.append(f"KS: {result.ks_statistic:.4f}")
    metrics_str = ", ".join(metrics_parts)

    # Join only the non-empty lines so that a missing user title or an empty
    # metrics string does not leave a blank line in the title.
    full_title = "\n".join(line for line in (title, dist_title, metrics_str) if line)

    ax.set_title(full_title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    # Configure legend
    ax.legend(fontsize=legend_fontsize, loc="best", framealpha=0.9)

    # Configure grid
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=1)

    # Integer ticks, thinned to roughly 20 so wide ranges stay readable
    ax.set_xticks(x_range[:: max(1, len(x_range) // 20)])

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_residual_histogram(
    result: "DistributionFitResult",
    y_hist: np.ndarray,
    x_hist: np.ndarray,
    title: str = "",
    xlabel: str = "Residual (Observed - Expected)",
    ylabel: str = "Frequency",
    figsize: Tuple[int, int] = (10, 8),
    dpi: int = 100,
    bins: int = 30,
    histogram_alpha: float = 0.7,
    histogram_color: str = "steelblue",
    show_zero_line: bool = True,
    zero_line_color: str = "red",
    zero_line_style: str = "--",
    zero_line_width: float = 1.5,
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Draw a histogram of density residuals (observed minus fitted).

    For every bin center the fitted density ``result.pdf(x_hist)`` is
    subtracted from the empirical density ``y_hist``; the resulting
    residuals are then binned. Residuals concentrated around zero indicate
    a good fit. The title reports the mean and standard deviation of the
    residuals below the distribution summary.

    Args:
        result: Fitted distribution result
        y_hist: Histogram density values (empirical density)
        x_hist: Histogram bin centers
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        bins: Number of bins for the residual histogram
        histogram_alpha: Histogram transparency (0-1)
        histogram_color: Color of histogram bars
        show_zero_line: Whether to show a vertical line at zero
        zero_line_color: Color of the zero reference line
        zero_line_style: Style of the zero reference line
        zero_line_width: Width of the zero reference line
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> y_hist, x_edges = np.histogram(data, bins=50, density=True)
        >>> x_hist = (x_edges[:-1] + x_edges[1:]) / 2
        >>> plot_residual_histogram(best, y_hist, x_hist)
    """
    _check_matplotlib()
    _validate_histogram_inputs(y_hist, x_hist, "plot_residual_histogram")

    # Observed minus expected density at each bin center.
    expected = result.pdf(x_hist)
    residuals = y_hist - expected

    fig, ax = plt.subplots(figsize=figsize)

    hist_style = {
        "bins": bins,
        "alpha": histogram_alpha,
        "color": histogram_color,
        "edgecolor": "white",
        "linewidth": 0.5,
        "label": "Residuals",
        "zorder": 2,
    }
    ax.hist(residuals, **hist_style)

    if show_zero_line:
        # Vertical marker at zero residual, drawn above the bars.
        ax.axvline(
            x=0,
            color=zero_line_color,
            linestyle=zero_line_style,
            linewidth=zero_line_width,
            label="Zero",
            zorder=3,
        )

    dist_title, _ = _format_distribution_params(result)

    # Summary statistics shown on the last title line.
    residual_mean = np.mean(residuals)
    residual_std = np.std(residuals)
    stats_line = f"Mean={residual_mean:.6f}, Std={residual_std:.6f}"

    title_lines = [dist_title, stats_line]
    if title:
        title_lines.insert(0, title)

    ax.set_title("\n".join(title_lines), fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    ax.legend(fontsize=10, loc="upper right", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




# [docs]
def plot_cdf_comparison(
    result: "DistributionFitResult",
    data: np.ndarray,
    title: str = "",
    xlabel: str = "Value",
    ylabel: str = "Cumulative Probability",
    figsize: Tuple[int, int] = (10, 8),
    dpi: int = 100,
    empirical_color: str = "steelblue",
    empirical_linewidth: float = 2.0,
    empirical_alpha: float = 0.8,
    theoretical_color: str = "red",
    theoretical_linewidth: float = 2.0,
    theoretical_linestyle: str = "--",
    title_fontsize: int = 14,
    label_fontsize: int = 12,
    legend_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, Axes]:
    """Plot the empirical CDF of a sample against the fitted distribution's CDF.

    The empirical CDF is drawn as a step function through the sorted sample:
    the i-th smallest of ``n`` values is plotted at height ``i / n``. The
    theoretical CDF is ``result.cdf`` evaluated on 1000 evenly spaced points
    spanning the sample's minimum and maximum. A good fit shows the two curves
    lying close together.

    The plot title is made of the optional ``title``, the formatted
    distribution name, and a metrics line. The metrics line shows the K-S
    statistic (plus the p-value when present) if ``result.ks_statistic`` is
    not None, and the SSE otherwise.

    Args:
        result: Fitted distribution result
        data: Sample data. Any array-like is accepted; it is flattened to 1D
            before use, so column vectors are treated as a single sample.
        title: Plot title
        xlabel: X-axis label
        ylabel: Y-axis label
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        empirical_color: Color of empirical CDF line
        empirical_linewidth: Line width for empirical CDF
        empirical_alpha: Transparency of empirical CDF line
        theoretical_color: Color of theoretical CDF line
        theoretical_linewidth: Line width for theoretical CDF
        theoretical_linestyle: Line style for theoretical CDF
        title_fontsize: Title font size
        label_fontsize: Axis label font size
        legend_fontsize: Legend font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure. When set, the figure is
            written there and a warning announcing the path is emitted.
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, axis)

    Raises:
        ImportError: If matplotlib is not installed.
        ValueError: If ``data`` is None or contains no values.

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> plot_cdf_comparison(best, data, title='CDF Comparison')
    """
    _check_matplotlib()

    # Validate data array
    if data is None:
        raise ValueError("plot_cdf_comparison requires non-empty data array")

    # Flatten up front: np.sort works along the last axis only, so a column
    # vector of shape (n, 1) would otherwise come back unsorted, and len()
    # would count rows instead of values.
    sample = np.asarray(data).ravel()
    if sample.size == 0:
        raise ValueError("plot_cdf_comparison requires non-empty data array")

    # Sort data for empirical CDF
    sorted_data = np.sort(sample)
    n = sorted_data.size

    # Empirical CDF heights 1/n, 2/n, ..., 1 for the sorted values
    empirical_cdf = np.arange(1, n + 1) / n

    # Compute theoretical CDF over the data range
    x_range = np.linspace(sorted_data.min(), sorted_data.max(), 1000)
    theoretical_cdf = result.cdf(x_range)

    # Create figure
    fig, ax = plt.subplots(figsize=figsize)

    # Plot empirical CDF as step function; where="post" holds each height
    # until the next sorted value is reached.
    ax.step(
        sorted_data,
        empirical_cdf,
        where="post",
        color=empirical_color,
        linewidth=empirical_linewidth,
        alpha=empirical_alpha,
        label="Empirical CDF",
        zorder=2,
    )

    # Plot theoretical CDF as smooth curve, drawn above the empirical steps
    ax.plot(
        x_range,
        theoretical_cdf,
        color=theoretical_color,
        linewidth=theoretical_linewidth,
        linestyle=theoretical_linestyle,
        label="Theoretical CDF",
        zorder=3,
    )

    # Format title with distribution info
    dist_title, _ = _format_distribution_params(result)

    # Prefer the K-S statistic (with p-value when present); fall back to SSE
    if result.ks_statistic is not None:
        metrics_str = f"KS={result.ks_statistic:.6f}"
        if result.pvalue is not None:
            metrics_str += f", p={result.pvalue:.4f}"
    else:
        metrics_str = f"SSE={result.sse:.6f}"

    full_title = f"{title}\n{dist_title}\n{metrics_str}" if title else f"{dist_title}\n{metrics_str}"

    ax.set_title(full_title, fontsize=title_fontsize, pad=15)
    ax.set_xlabel(xlabel, fontsize=label_fontsize)
    ax.set_ylabel(ylabel, fontsize=label_fontsize)

    # Leave a little headroom above probability 1 so the top step stays visible
    ax.set_ylim([0, 1.05])

    # Configure legend and grid
    ax.legend(fontsize=legend_fontsize, loc="lower right", framealpha=0.9)
    ax.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, ax




[docs]
def plot_diagnostics(
    result: "DistributionFitResult",
    data: np.ndarray,
    y_hist: Optional[np.ndarray] = None,
    x_hist: Optional[np.ndarray] = None,
    bins: int = 50,
    title: str = "",
    figsize: Tuple[int, int] = (14, 12),
    dpi: int = 100,
    title_fontsize: int = 16,
    subplot_title_fontsize: int = 12,
    label_fontsize: int = 10,
    grid_alpha: float = 0.3,
    save_path: Optional[str] = None,
    save_format: str = "png",
) -> Tuple[Figure, np.ndarray]:
    """Create a 2x2 diagnostic plot panel for assessing distribution fit quality.

    Generates four diagnostic plots:
    - Q-Q Plot (top-left): Compares sample quantiles vs theoretical quantiles
    - P-P Plot (top-right): Compares empirical vs theoretical probabilities
    - Residual Histogram (bottom-left): Distribution of the differences between
      the histogram density and ``result.pdf`` at the bin centers, drawn with a
      fixed 30 bins
    - CDF Comparison (bottom-right): Empirical vs theoretical CDF overlay

    Args:
        result: Fitted distribution result
        data: Sample data. Any array-like is accepted; it is flattened to 1D
            before use.
        y_hist: Optional pre-computed histogram density values. Must be passed
            together with ``x_hist`` and have the same shape.
        x_hist: Optional pre-computed histogram bin centers. Must be passed
            together with ``y_hist`` and have the same shape.
        bins: Number of histogram bins. Used only when the histogram is
            computed here, i.e. when ``y_hist`` or ``x_hist`` is None; in that
            case BOTH are recomputed from ``data`` and a lone supplied array is
            ignored.
        title: Overall figure title
        figsize: Figure size (width, height)
        dpi: Dots per inch for saved figures
        title_fontsize: Main title font size
        subplot_title_fontsize: Subplot title font size
        label_fontsize: Axis label font size
        grid_alpha: Grid transparency (0-1)
        save_path: Optional path to save figure. When set, the figure is
            written there and a warning announcing the path is emitted.
        save_format: Save format (png, pdf, svg)

    Returns:
        Tuple of (figure, 2x2 array of axes)

    Raises:
        ImportError: If matplotlib is not installed.
        ValueError: If ``data`` is None or contains no values, or if supplied
            ``y_hist`` and ``x_hist`` differ in shape.

    Example:
        >>> from spark_bestfit import DistributionFitter
        >>> fitter = DistributionFitter(spark)
        >>> results = fitter.fit(df, 'value')
        >>> best = results.best(n=1)[0]
        >>> fig, axes = plot_diagnostics(best, data, title='Fit Diagnostics')
    """
    _check_matplotlib()

    # Validate data array
    if data is None:
        raise ValueError("plot_diagnostics requires non-empty data array")

    # Flatten once so every panel sees the same 1D sample; np.sort works along
    # the last axis only and would leave a column vector unsorted.
    sample = np.asarray(data).ravel()
    if sample.size == 0:
        raise ValueError("plot_diagnostics requires non-empty data array")

    if y_hist is None or x_hist is None:
        # Either piece missing: rebuild both so densities and centers match
        y_hist, x_edges = np.histogram(sample, bins=bins, density=True)
        x_hist = (x_edges[:-1] + x_edges[1:]) / 2
    else:
        # Check caller-supplied arrays before any figure is created; mismatched
        # shapes would otherwise fail late or broadcast into meaningless residuals.
        y_hist = np.asarray(y_hist)
        x_hist = np.asarray(x_hist)
        if y_hist.shape != x_hist.shape:
            raise ValueError(
                "plot_diagnostics requires y_hist and x_hist with matching shapes, "
                f"got {y_hist.shape} and {x_hist.shape}"
            )

    # Create 2x2 subplot grid
    fig, axes = plt.subplots(2, 2, figsize=figsize)

    # Format distribution info for the overall figure title
    dist_info, _ = _format_distribution_params(result, precision=2)

    # Q-Q Plot (top-left)
    ax_qq = axes[0, 0]
    _render_qq_to_ax(
        ax_qq,
        result,
        sample,
        marker_size=20,
        marker_alpha=0.5,
        edge_width=0.3,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=8,
        reference_label="y=x",
    )
    ax_qq.set_title("Q-Q Plot", fontsize=subplot_title_fontsize)
    ax_qq.set_xlabel("Theoretical Quantiles", fontsize=label_fontsize)
    ax_qq.set_ylabel("Sample Quantiles", fontsize=label_fontsize)

    # P-P Plot (top-right)
    ax_pp = axes[0, 1]
    _render_pp_to_ax(
        ax_pp,
        result,
        sample,
        marker_size=20,
        marker_alpha=0.5,
        edge_width=0.3,
        grid_alpha=grid_alpha,
        show_legend=True,
        legend_fontsize=8,
        reference_label="y=x",
    )
    ax_pp.set_title("P-P Plot", fontsize=subplot_title_fontsize)
    ax_pp.set_xlabel("Theoretical Probabilities", fontsize=label_fontsize)
    ax_pp.set_ylabel("Sample Probabilities", fontsize=label_fontsize)

    # Residual Histogram (bottom-left): observed density minus fitted density
    ax_resid = axes[1, 0]
    theoretical_density = result.pdf(x_hist)
    residuals = y_hist - theoretical_density
    mean_resid = np.mean(residuals)
    std_resid = np.std(residuals)

    ax_resid.hist(
        residuals,
        bins=30,
        alpha=0.7,
        color="steelblue",
        edgecolor="white",
        linewidth=0.5,
        zorder=2,
    )
    ax_resid.axvline(x=0, color="red", linestyle="--", linewidth=1.5, label="Zero", zorder=3)
    ax_resid.set_title(
        f"Residual Histogram\nMean={mean_resid:.4f}, Std={std_resid:.4f}", fontsize=subplot_title_fontsize
    )
    ax_resid.set_xlabel("Residual (Observed - Expected)", fontsize=label_fontsize)
    ax_resid.set_ylabel("Frequency", fontsize=label_fontsize)
    ax_resid.legend(fontsize=8, loc="upper right")
    ax_resid.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    # CDF Comparison (bottom-right)
    ax_cdf = axes[1, 1]
    sorted_data = np.sort(sample)
    n = sorted_data.size
    # Empirical CDF heights 1/n, 2/n, ..., 1 for the sorted values
    empirical_cdf = np.arange(1, n + 1) / n
    x_range = np.linspace(sorted_data.min(), sorted_data.max(), 1000)
    theoretical_cdf = result.cdf(x_range)

    ax_cdf.step(
        sorted_data,
        empirical_cdf,
        where="post",
        color="steelblue",
        linewidth=1.5,
        alpha=0.8,
        label="Empirical CDF",
        zorder=2,
    )
    ax_cdf.plot(
        x_range,
        theoretical_cdf,
        color="red",
        linewidth=1.5,
        linestyle="--",
        label="Theoretical CDF",
        zorder=3,
    )
    # Leave a little headroom above probability 1 so the top step stays visible
    ax_cdf.set_ylim([0, 1.05])
    ax_cdf.set_title("CDF Comparison", fontsize=subplot_title_fontsize)
    ax_cdf.set_xlabel("Value", fontsize=label_fontsize)
    ax_cdf.set_ylabel("Cumulative Probability", fontsize=label_fontsize)
    ax_cdf.legend(fontsize=8, loc="lower right")
    ax_cdf.grid(alpha=grid_alpha, linestyle="--", linewidth=0.5, zorder=0)

    # Set overall title; y=1.02 places it just above the top of the figure
    if title:
        fig.suptitle(f"{title}\n{dist_info}", fontsize=title_fontsize, y=1.02)
    else:
        fig.suptitle(dist_info, fontsize=title_fontsize, y=1.02)

    plt.tight_layout()

    if save_path:
        plt.savefig(save_path, dpi=dpi, format=save_format, bbox_inches="tight")
        warnings.warn(f"Plot saved to: {save_path}", stacklevel=2)

    return fig, axes