# Source code for pandas_profiling.config

"""Configuration for the package."""
import copy
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple

from pydantic import BaseModel, BaseSettings, Field, PrivateAttr


def _merge_dictionaries(dict1: dict, dict2: dict) -> dict:
    """
    Recursive merge dictionaries.

    :param dict1: Base dictionary to merge.
    :param dict2: Dictionary to merge on top of base dictionary.
    :return: Merged dictionary
    """
    for key, val in dict1.items():
        if isinstance(val, dict):
            dict2_node = dict2.setdefault(key, {})
            _merge_dictionaries(val, dict2_node)
        else:
            if key not in dict2:
                dict2[key] = val

    return dict2


# Free-text metadata fields; each one defaults to the empty string.
class Dataset(BaseModel):
    """Metadata of the dataset"""

    description: str = ""
    creator: str = ""
    author: str = ""
    copyright_holder: str = ""
    copyright_year: str = ""
    url: str = ""
# Options stored under ``Univariate.num``.
class NumVars(BaseModel):
    quantiles: List[float] = [0.05, 0.25, 0.5, 0.75, 0.95]
    skewness_threshold: int = 20
    low_categorical_threshold: int = 5

    # Set to zero to disable
    chi_squared_threshold: float = 0.999
# Options stored under ``Univariate.cat``.
class CatVars(BaseModel):
    length: bool = True
    characters: bool = True
    words: bool = True
    cardinality_threshold: int = 50
    n_obs: int = 5

    # Set to zero to disable
    chi_squared_threshold: float = 0.999

    coerce_str_to_date: bool = False
    redact: bool = False
    histogram_largest: int = 50
    stop_words: List[str] = []
# Options stored under ``Univariate.bool``.
class BoolVars(BaseModel):
    n_obs: int = 3

    # Mapping from string values to the boolean they stand for
    mappings: Dict[str, bool] = {
        "t": True,
        "f": False,
        "yes": True,
        "no": False,
        "y": True,
        "n": False,
        "true": True,
        "false": False,
    }
# Options stored under ``Univariate.file``; inactive by default.
class FileVars(BaseModel):
    active: bool = False
# Options stored under ``Univariate.path``; inactive by default.
class PathVars(BaseModel):
    active: bool = False
# Options stored under ``Univariate.image``; inactive by default.
class ImageVars(BaseModel):
    active: bool = False
    exif: bool = True
    hash: bool = True
# Options stored under ``Univariate.url``; inactive by default.
class UrlVars(BaseModel):
    active: bool = False
# Options stored under ``Univariate.timeseries``; inactive by default.
class TimeseriesVars(BaseModel):
    active: bool = False
    sortby: Optional[str] = None
    autocorrelation: float = 0.7
    lags: List[int] = [1, 7, 12, 24, 30]
    significance: float = 0.05
    pacf_acf_lag: int = 100
# Groups one options model per variable type, each with its default settings.
class Univariate(BaseModel):
    num: NumVars = NumVars()
    cat: CatVars = CatVars()
    image: ImageVars = ImageVars()
    bool: BoolVars = BoolVars()
    path: PathVars = PathVars()
    file: FileVars = FileVars()
    url: UrlVars = UrlVars()
    timeseries: TimeseriesVars = TimeseriesVars()
# Options stored under ``Plot.missing``.
class MissingPlot(BaseModel):
    # Force labels when there are > 50 variables
    force_labels: bool = True

    cmap: str = "RdBu"
# Allowed values for ``Plot.image_format``.
class ImageType(Enum):
    svg = "svg"
    png = "png"
# Options stored under ``Plot.correlation``.
class CorrelationPlot(BaseModel):
    cmap: str = "RdBu"
    bad: str = "#000000"
# Options stored under ``Plot.histogram``.
class Histogram(BaseModel):
    # Number of bins (set to 0 to automatically detect the bin size)
    bins: int = 50

    # Maximum number of bins (when bins=0)
    max_bins: int = 250

    x_axis_labels: bool = True
# Options stored under ``Plot.cat_freq``.
class CatFrequencyPlot(BaseModel):
    # if false, the category frequency plot is turned off
    show: bool = True

    # options: 'bar', 'pie'
    type: str = "bar"

    # The cat frequency plot is only rendered if the number of distinct values is
    # smaller or equal to "max_unique"
    max_unique: int = 10

    # Colors should be a list of matplotlib recognised strings:
    # --> https://matplotlib.org/stable/tutorials/colors/colors.html
    # --> matplotlib defaults are used by default
    colors: Optional[List[str]] = None
# Plot-related options, stored under ``Settings.plot``.
class Plot(BaseModel):
    missing: MissingPlot = MissingPlot()
    image_format: ImageType = ImageType.svg
    correlation: CorrelationPlot = CorrelationPlot()

    # PNG dpi
    dpi: int = 800

    histogram: Histogram = Histogram()
    scatter_threshold: int = 1000
    cat_freq: CatFrequencyPlot = CatFrequencyPlot()
# Allowed values for ``Style.theme``.
class Theme(Enum):
    united = "united"
    flatly = "flatly"
    cosmo = "cosmo"
    simplex = "simplex"
# Styling options, stored under ``Html.style``.
class Style(BaseModel):
    # Primary color used for comparisons (default: blue, red, green)
    primary_colors: List[str] = ["#377eb8", "#e41a1c", "#4daf4a"]

    # Base64-encoded logo image
    logo: str = ""

    # HTML Theme (optional, default: None)
    theme: Optional[Theme] = None

    # Labels used for comparing reports (private attribute)
    _labels: List[str] = PrivateAttr(["_"])

    # Primary color used for plotting and text where applicable.
    @property
    def primary_color(self) -> str:
        """Return the first entry of ``primary_colors``."""
        # This attribute may be deprecated in the future, please use primary_colors[0]
        return self.primary_colors[0]
# HTML report options, stored under ``Settings.html``.
class Html(BaseModel):
    # Styling options for the HTML report
    style: Style = Style()

    # Show navbar
    navbar_show: bool = True

    # Minify the html
    minify_html: bool = True

    # Offline support
    use_local_assets: bool = True

    # If True, single file, else directory with assets
    inline: bool = True

    # Assets prefix if inline = True
    assets_prefix: Optional[str] = None

    # Internal usage
    assets_path: Optional[str] = None

    full_width: bool = False
# Options stored under ``Settings.duplicates``.
class Duplicates(BaseModel):
    head: int = 10
    key: str = "# duplicates"
# Options for one correlation entry; ``Settings.correlations`` maps names to
# instances of this model.
class Correlation(BaseModel):
    key: str = ""
    calculate: bool = Field(default=True)
    warn_high_correlations: int = Field(default=10)
    threshold: float = Field(default=0.5)
    n_bins: int = Field(default=10)
# Holds a ``Correlation`` entry for "pearson" and one for "spearman".
class Correlations(BaseModel):
    pearson: Correlation = Correlation(key="pearson")
    spearman: Correlation = Correlation(key="spearman")
# Options stored under ``Settings.interactions``.
class Interactions(BaseModel):
    # Set to False to disable scatter plots
    continuous: bool = True

    targets: List[str] = []
# Options stored under ``Settings.samples``.
class Samples(BaseModel):
    head: int = 10
    tail: int = 10
    random: int = 0
# Options stored under ``Settings.variables``.
class Variables(BaseModel):
    descriptions: dict = {}
# Allowed values for ``Iframe.attribute``.
class IframeAttribute(Enum):
    src = "src"
    srcdoc = "srcdoc"


# Iframe options, stored under ``Notebook.iframe``.
class Iframe(BaseModel):
    height: str = "800px"
    width: str = "100%"
    attribute: IframeAttribute = IframeAttribute.srcdoc


class Notebook(BaseModel):
    """When in a Jupyter notebook"""

    iframe: Iframe = Iframe()
# Options stored under ``Settings.report``.
class Report(BaseModel):
    # Numeric precision for displaying statistics
    precision: int = 8
# Root settings model; aggregates every option group defined in this module.
class Settings(BaseSettings):
    # Default prefix to avoid collisions with environment variables
    class Config:
        env_prefix = "profile_"

    # Title of the document
    title: str = "Pandas Profiling Report"

    dataset: Dataset = Dataset()
    variables: Variables = Variables()
    infer_dtypes: bool = True

    # Show the description at each variable (in addition to the overview tab)
    show_variable_description: bool = True

    # Number of workers (0=multiprocessing.cpu_count())
    pool_size: int = 0

    # Show the progress bar
    progress_bar: bool = True

    # Per variable type description settings
    vars: Univariate = Univariate()

    # Sort the variables. Possible values: ascending, descending or None (leaves original sorting)
    sort: Optional[str] = None

    missing_diagrams: Dict[str, bool] = {
        "bar": True,
        "matrix": True,
        "heatmap": True,
    }

    correlations: Dict[str, Correlation] = {
        "auto": Correlation(key="auto"),
        "spearman": Correlation(key="spearman"),
        "pearson": Correlation(key="pearson"),
        "kendall": Correlation(key="kendall"),
        "cramers": Correlation(key="cramers"),
        "phi_k": Correlation(key="phi_k"),
    }

    interactions: Interactions = Interactions()

    categorical_maximum_correlation_distinct: int = 100

    # Use `deep` flag for memory_usage
    memory_deep: bool = False

    plot: Plot = Plot()
    duplicates: Duplicates = Duplicates()
    samples: Samples = Samples()

    reject_variables: bool = True

    # The number of observations to show
    n_obs_unique: int = 10
    n_freq_table_max: int = 10
    n_extreme_obs: int = 10

    # Report rendering
    report: Report = Report()
    html: Html = Html()
    # Annotated explicitly, like every other field, instead of relying on
    # type inference from the default value.
    notebook: Notebook = Notebook()

    def update(self, updates: dict) -> "Settings":
        """
        Return a new ``Settings`` with ``updates`` applied on top of this one.

        The current values (``self.dict()``) are used as the base; values in
        ``updates`` take precedence, and nested dictionaries are merged key
        by key.

        :param updates: (Nested) dictionary of overrides. It is not modified.
        :return: A new, re-parsed ``Settings`` object.
        """
        # _merge_dictionaries writes into its second argument, so merge into
        # a private copy. Otherwise the caller's dictionary (possibly one of
        # the class-level tables in the module-level ``Config``) would be
        # filled with this instance's values.
        merged = _merge_dictionaries(self.dict(), copy.deepcopy(updates))
        return self.parse_obj(self.copy(update=merged))
class Config:
    """
    Named groups of settings overrides and shorthand expansions.

    ``arg_groups`` maps a group name to a (nested) dictionary of overrides.
    ``_shorthands`` maps a top-level key to the dictionary that replaces it
    when that key is given as ``None``.
    """

    arg_groups: Dict[str, Any] = {
        "sensitive": {
            "samples": None,
            "duplicates": None,
            "vars": {"cat": {"redact": True}},
        },
        "dark_mode": {
            "html": {
                "style": {
                    "theme": Theme.flatly,
                    # ``primary_color`` is a read-only property of ``Style``,
                    # not a field, so the override has to target
                    # ``primary_colors``. The trailing entries repeat the
                    # ``Style`` defaults so the list keeps its default length.
                    "primary_colors": ["#2c3e50", "#e41a1c", "#4daf4a"],
                }
            }
        },
        "orange_mode": {
            "html": {
                "style": {
                    "theme": Theme.united,
                    # See the note on "dark_mode" above.
                    "primary_colors": ["#d34615", "#e41a1c", "#4daf4a"],
                }
            }
        },
        "explorative": {
            "vars": {
                "cat": {"characters": True, "words": True},
                "url": {"active": True},
                "path": {"active": True},
                "file": {"active": True},
                "image": {"active": True},
            },
            "n_obs_unique": 10,
            "n_extreme_obs": 10,
            "n_freq_table_max": 10,
            "memory_deep": True,
        },
    }

    _shorthands = {
        "dataset": {
            "creator": "",
            "author": "",
            "description": "",
            "copyright_holder": "",
            "copyright_year": "",
            "url": "",
        },
        "samples": {"head": 0, "tail": 0, "random": 0},
        "duplicates": {"head": 0},
        "interactions": {"targets": [], "continuous": False},
        "missing_diagrams": {
            "bar": False,
            "matrix": False,
            "heatmap": False,
        },
        "correlations": {
            "auto": {"calculate": False},
            "pearson": {"calculate": False},
            "spearman": {"calculate": False},
            "kendall": {"calculate": False},
            "phi_k": {"calculate": False},
            "cramers": {"calculate": False},
        },
    }

    @staticmethod
    def get_arg_groups(key: str) -> dict:
        """
        Return the overrides of the argument group ``key``, shorthands expanded.

        :param key: Name of an entry in ``Config.arg_groups``.
        :return: A fresh dictionary; ``Config.arg_groups`` is left untouched.
        :raises KeyError: If ``key`` is not a known argument group.
        """
        kwargs = Config.arg_groups[key]
        shorthand_args, _ = Config.shorthands(kwargs, split=False)
        return shorthand_args

    @staticmethod
    def shorthands(kwargs: dict, split: bool = True) -> Tuple[dict, dict]:
        """
        Expand ``None``-valued shorthand keys into their full dictionaries.

        A key counts as a shorthand when its value is ``None`` and the key
        exists in ``Config._shorthands``.

        :param kwargs: Overrides to inspect. With ``split=True`` the expanded
            keys are removed from this dictionary in place.
        :param split: If True, return the expanded shorthands and the
            remaining ``kwargs`` separately. If False, return one dictionary
            holding all of ``kwargs`` with shorthands expanded, plus an empty
            dictionary.
        :return: Tuple ``(shorthand_args, remaining_kwargs)``.
        """
        # Work on copies so that the class-level tables (and, when not
        # splitting, the caller's dictionary) are never aliased by the
        # result; callers go on to merge other values into what we return.
        shorthand_args = {} if split else copy.deepcopy(kwargs)

        for key, value in list(kwargs.items()):
            if value is None and key in Config._shorthands:
                shorthand_args[key] = copy.deepcopy(Config._shorthands[key])
                if split:
                    del kwargs[key]

        if split:
            return shorthand_args, kwargs
        return shorthand_args, {}