Source code for shifterator.shifts

import math

from . import entropy
from .shifterator import Shift


class WeightedAvgShift(Shift):
    """
    Shift object for calculating weighted scores of two systems of types,
    and the shift between them

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types
    type2score_1, type2score_2: dict or str, optional
        If dict, types are keys and values are scores associated with each
        type. If str, the name of a score lexicon included in Shifterator.
        If None and other type2score is None, defaults to uniform scores
        across types. Otherwise defaults to the other type2score dict
    reference_value: str or float, optional
        The reference score to use to partition scores into two different
        regimes. If 'average', uses the average score according to type2freq_1
        and type2score_1. If None and a lexicon is selected for type2score,
        uses the respective middle point in that lexicon's scale. Otherwise
        if None, uses zero as the reference point
    handle_missing_scores: str, optional
        If 'error', throws an error whenever a word has a score in one score
        dictionary but not the other. If 'exclude', excludes any word that is
        missing a score in one score dictionary from all word shift
        calculations, regardless if it may have a score in the other dictionary.
        If 'adopt' and the score is missing in one dictionary, then uses the
        score from the other dictionary if it is available
    stop_lens: iterable of 2-tuples, optional
        Denotes intervals of scores that should be excluded from word shifts
        calculations. Types with scores in this range will be excluded from word
        shift calculations
    stop_words: set, optional
        Denotes words that should be excluded from word shifts calculations.
        If None (the default), a new empty set is used
    normalization: str, optional
        If 'variation', normalizes shift scores so that the sum of
        their absolute values sums to 1. If 'trajectory', normalizes
        them so that the sum of shift scores is 1 or -1. The trajectory
        normalization cannot be applied if the total shift score is 0, so
        scores are left unnormalized if the total is 0 and 'trajectory' is
        specified
    """

    def __init__(
        self,
        type2freq_1,
        type2freq_2,
        type2score_1=None,
        type2score_2=None,
        reference_value=None,
        handle_missing_scores="error",
        stop_lens=None,
        stop_words=None,
        normalization="variation",
    ):
        # Build the empty set per call: a `set()` default in the signature is
        # evaluated once and would be shared by every instance
        if stop_words is None:
            stop_words = set()
        super().__init__(
            type2freq_1=type2freq_1,
            type2freq_2=type2freq_2,
            type2score_1=type2score_1,
            type2score_2=type2score_2,
            reference_value=reference_value,
            handle_missing_scores=handle_missing_scores,
            stop_lens=stop_lens,
            stop_words=stop_words,
            normalization=normalization,
        )


class ProportionShift(Shift):
    """
    Shift object for comparing the proportions of types between two systems

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types.
        Both dicts are copied, and a type that appears in only one of them is
        given a frequency of 0 in the copy of the other
    """

    def __init__(self, type2freq_1, type2freq_2):
        # Work on copies so the caller's dicts are never modified
        freqs_1 = type2freq_1.copy()
        freqs_2 = type2freq_2.copy()
        # Pad each system with zero counts until both cover the same types
        for t in set(type2freq_1.keys()).union(type2freq_2.keys()):
            freqs_1.setdefault(t, 0)
            freqs_2.setdefault(t, 0)
        # No scores are supplied and every other Shift option is fixed here
        super().__init__(
            type2freq_1=freqs_1,
            type2freq_2=freqs_2,
            type2score_1=None,
            type2score_2=None,
            reference_value=0,
            handle_missing_scores="error",
            stop_lens=None,
            stop_words=None,
            normalization="variation",
        )

    def get_shift_graph(
        self,
        top_n=50,
        show_plot=True,
        detailed=False,
        text_size_inset=True,
        cumulative_inset=True,
        title=None,
        filename=None,
        **kwargs
    ):
        """
        Forward to `Shift.get_shift_graph` with `show_total=False`

        All arguments, including any extra keyword arguments, are passed on
        unchanged, except that a `title` of None is replaced by an empty
        string. Returns whatever the parent method returns.
        """
        return super().get_shift_graph(
            top_n=top_n,
            text_size_inset=text_size_inset,
            cumulative_inset=cumulative_inset,
            detailed=detailed,
            show_plot=show_plot,
            filename=filename,
            show_total=False,
            title="" if title is None else title,
            **kwargs
        )


class EntropyShift(Shift):
    """
    Shift object for calculating the shift in entropy between two systems

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types
    base: float, optional
        Base of the logarithm for calculating entropy
    alpha: float, optional
        The parameter for the generalized Tsallis entropy. Setting `alpha=1`
        recovers the Shannon entropy. Higher `alpha` emphasizes more common
        types, lower `alpha` emphasizes less common types
        For details: https://en.wikipedia.org/wiki/Tsallis_entropy
    reference_value: str or float, optional
        The reference score to use to partition scores into two different
        regimes. If 'average', uses the average score according to type2freq_1
        and type2score_1. Otherwise, uses zero as the reference point
    normalization: str, optional
        If 'variation', normalizes shift scores so that the sum of
        their absolute values sums to 1. If 'trajectory', normalizes
        them so that the sum of shift scores is 1 or -1. The trajectory
        normalization cannot be applied if the total shift score is 0, so
        scores are left unnormalized if the total is 0 and 'trajectory' is
        specified
    """

    def __init__(
        self,
        type2freq_1,
        type2freq_2,
        base=2,
        alpha=1,
        reference_value=0,
        normalization="variation",
    ):
        # Copies of the inputs are what get handed to the entropy helpers and
        # to the parent class
        freqs_1 = type2freq_1.copy()
        freqs_2 = type2freq_2.copy()
        rel_freqs_1 = entropy.get_relative_freqs(freqs_1)
        rel_freqs_2 = entropy.get_relative_freqs(freqs_2)
        # Per-type entropy scores for each system become the shift's scores
        scores_1, scores_2 = entropy.get_entropy_scores(
            rel_freqs_1, rel_freqs_2, base, alpha
        )

        super().__init__(
            type2freq_1=freqs_1,
            type2freq_2=freqs_2,
            type2score_1=scores_1,
            type2score_2=scores_2,
            handle_missing_scores="error",
            stop_lens=None,
            stop_words=None,
            reference_value=reference_value,
            normalization=normalization,
        )
        # Keep the relative frequencies and alpha available on the instance
        self.type2p_1 = rel_freqs_1
        self.type2p_2 = rel_freqs_2
        self.alpha = alpha

    def get_shift_graph(
        self,
        top_n=50,
        show_plot=True,
        detailed=False,
        text_size_inset=True,
        cumulative_inset=True,
        filename=None,
        **kwargs
    ):
        """
        Forward to `Shift.get_shift_graph`

        All arguments, including any extra keyword arguments, are passed on
        unchanged. Returns whatever the parent method returns.
        """
        return super().get_shift_graph(
            top_n=top_n,
            text_size_inset=text_size_inset,
            cumulative_inset=cumulative_inset,
            detailed=detailed,
            show_plot=show_plot,
            filename=filename,
            **kwargs
        )


class KLDivergenceShift(Shift):
    """
    Shift object for calculating the Kullback-Leibler divergence (KLD) between
    two systems

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types.
        The KLD will be computed with respect to type2freq_1, i.e. D(T2 || T1).
        For the KLD to be well defined, all types must have nonzero frequencies
        in both type2freq_1 and type2freq_2
    base: float, optional
        Base of the logarithm for calculating entropy
    reference_value: str or float, optional
        The reference score to use to partition scores into two different
        regimes. Defaults to zero as the reference point
    normalization: str, optional
        If 'variation', normalizes shift scores so that the sum of
        their absolute values sums to 1. If 'trajectory', normalizes
        them so that the sum of shift scores is 1 or -1. The trajectory
        normalization cannot be applied if the total shift score is 0, so
        scores are left unnormalized if the total is 0 and 'trajectory' is
        specified

    Raises
    ------
    ValueError
        If some type is a key of type2freq_2 but not of type2freq_1
    """

    def __init__(
        self,
        type2freq_1,
        type2freq_2,
        base=2,
        reference_value=0,
        normalization="variation",
    ):
        # Check that KLD is well defined: every type of system 2 must also be
        # a type of system 1
        if set(type2freq_2).difference(type2freq_1):
            raise ValueError(
                "There are types that appear in type2freq_2 but not "
                "type2freq_1: the KL divergence is not well-defined"
            )

        # Get relative frequencies
        type2freq_1 = type2freq_1.copy()
        type2freq_2 = type2freq_2.copy()
        type2p_1 = entropy.get_relative_freqs(type2freq_1)
        type2p_2 = entropy.get_relative_freqs(type2freq_2)

        # Get surprisal scores (alpha is fixed to 1)
        type2s_1, type2s_2 = entropy.get_entropy_scores(
            type2p_1, type2p_2, base, alpha=1
        )

        # Initialize shift
        # NOTE(review): type2freq_2 is passed for BOTH systems and the two
        # score dicts are swapped. This looks intentional -- presumably so the
        # weighted-average difference reduces to sum_i p2_i * (s1_i - s2_i),
        # i.e. D(T2 || T1) -- confirm against Shift before "fixing" it
        super().__init__(
            type2freq_1=type2freq_2,
            type2freq_2=type2freq_2,
            type2score_1=type2s_2,
            type2score_2=type2s_1,
            handle_missing_scores="error",
            stop_lens=None,
            stop_words=None,
            reference_value=reference_value,
            normalization=normalization,
        )
        self.type2p_1 = type2p_1
        self.type2p_2 = type2p_2

    def get_shift_graph(
        self,
        top_n=50,
        show_plot=True,
        detailed=False,
        text_size_inset=True,
        cumulative_inset=True,
        title=None,
        filename=None,
        **kwargs
    ):
        """
        Forward to `Shift.get_shift_graph`

        All arguments, including any extra keyword arguments, are passed on
        unchanged, except that a `title` of None is replaced by an empty
        string. Returns whatever the parent method returns.
        """
        if title is None:
            title = ""
        return super().get_shift_graph(
            top_n=top_n,
            text_size_inset=text_size_inset,
            cumulative_inset=cumulative_inset,
            detailed=detailed,
            show_plot=show_plot,
            title=title,
            filename=filename,
            **kwargs
        )


class JSDivergenceShift(Shift):
    """
    Shift object for calculating the Jensen-Shannon divergence (JSD) between two
    systems

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types
    weight_1, weight_2: float
        Relative weights of type2freq_1 and type2freq_2 when constructing their
        mixed distribution. Should sum to 1 (checked up to floating point
        rounding error)
    base: float, optional
        Base of the logarithm for calculating entropy
    alpha: float, optional
        The parameter for the generalized Tsallis entropy. Setting `alpha=1`
        recovers the Shannon entropy. Higher `alpha` emphasizes more common
        types, lower `alpha` emphasizes less common types
        For details: https://en.wikipedia.org/wiki/Tsallis_entropy
    reference_value: str or float, optional
        The reference score to use to partition scores into two different
        regimes. Defaults to zero as the reference point
    normalization: str, optional
        If 'variation', normalizes shift scores so that the sum of
        their absolute values sums to 1. If 'trajectory', normalizes
        them so that the sum of shift scores is 1 or -1. The trajectory
        normalization cannot be applied if the total shift score is 0, so
        scores are left unnormalized if the total is 0 and 'trajectory' is
        specified

    Raises
    ------
    ValueError
        If weight_1 and weight_2 do not sum to 1
    """

    def __init__(
        self,
        type2freq_1,
        type2freq_2,
        base=2,
        weight_1=0.5,
        weight_2=0.5,
        alpha=1,
        reference_value=0,
        normalization="variation",
    ):
        # Check weights. Compare with a tolerance: an exact `!= 1` test can
        # reject valid weights whose float sum is off by a rounding error
        if not math.isclose(weight_1 + weight_2, 1):
            raise ValueError("weight_1 and weight_2 do not sum to 1")

        # Get relative frequencies
        type2freq_1 = type2freq_1.copy()
        type2freq_2 = type2freq_2.copy()
        type2p_1 = entropy.get_relative_freqs(type2freq_1)
        type2p_2 = entropy.get_relative_freqs(type2freq_2)
        # Get shift scores, along with the mixed distribution type2m
        type2m, type2s_1, type2s_2 = entropy.get_jsd_scores(
            type2p_1,
            type2p_2,
            weight_1=weight_1,
            weight_2=weight_2,
            base=base,
            alpha=alpha,
        )

        # Initialize shift object
        super().__init__(
            type2freq_1=type2freq_1,
            type2freq_2=type2freq_2,
            type2score_1=type2s_1,
            type2score_2=type2s_2,
            reference_value=reference_value,
            handle_missing_scores="error",
            normalization=normalization,
            stop_lens=None,
            stop_words=None,
        )
        self.type2p_1 = type2p_1
        self.type2p_2 = type2p_2
        self.type2m = type2m
        self.alpha = alpha

    def get_shift_graph(
        self,
        top_n=50,
        show_plot=True,
        detailed=False,
        text_size_inset=True,
        cumulative_inset=True,
        title=None,
        filename=None,
        **kwargs
    ):
        """
        Forward to `Shift.get_shift_graph`

        All arguments, including any extra keyword arguments, are passed on
        unchanged, except that a `title` of None is replaced by an empty
        string. `all_pos_contributions` is passed as True only when `alpha`
        is 1 and the reference value is 0. Returns whatever the parent method
        returns.
        """
        all_pos_contributions = bool(self.alpha == 1 and self.reference_value == 0)
        if title is None:
            title = ""
        return super().get_shift_graph(
            top_n=top_n,
            text_size_inset=text_size_inset,
            cumulative_inset=cumulative_inset,
            detailed=detailed,
            show_plot=show_plot,
            filename=filename,
            title=title,
            all_pos_contributions=all_pos_contributions,
            **kwargs
        )
Source code for shifterator.shifts

Shifterator

Navigation

Related Topics