Source code for shifterator.shifts

from . import entropy
from .shifterator import Shift


[docs]class WeightedAvgShift(Shift): """ Shift object for calculating weighted scores of two systems of types, and the shift between them Parameters ---------- type2freq_1, type2freq_2: dict Keys are types of a system and values are frequencies of those types type2score_1, type2score_2: dict or str, optional If dict, types are keys and values are scores associated with each type. If str, the name of a score lexicon included in Shifterator. If None and other type2score is None, defaults to uniform scores across types. Otherwise defaults to the other type2score dict reference_value: str or float, optional The reference score to use to partition scores into two different regimes. If 'average', uses the average score according to type2freq_1 and type2score_1. If None and a lexicon is selected for type2score, uses the respective middle point in that lexicon's scale. Otherwise if None, uses zero as the reference point handle_missing_scores: str, optional If 'error', throws an error whenever a word has a score in one score dictionary but not the other. If 'exclude', excludes any word that is missing a score in one score dictionary from all word shift calculations, regardless if it may have a score in the other dictionary. If 'adopt' and the score is missing in one dictionary, then uses the score from the other dictionary if it is available stop_lens: iterable of 2-tuples, optional Denotes intervals of scores that should be excluded from word shifts calculations. Types with scores in this range will be excluded from word shift calculations stop_words: set, optional Denotes words that should be excluded from word shifts calculations normalization: str, optional If 'variation', normalizes shift scores so that the sum of their absolute values sums to 1. If 'trajectory', normalizes them so that the sum of shift scores is 1 or -1. The trajectory normalization cannot be applied if the total shift score is 0, so scores are left unnormalized if the total is 0 and 'trajectory' is specified """ def __init__( self, type2freq_1, type2freq_2, type2score_1=None, type2score_2=None, reference_value=None, handle_missing_scores="error", stop_lens=None, stop_words=set(), normalization="variation", ): super().__init__( type2freq_1=type2freq_1, type2freq_2=type2freq_2, type2score_1=type2score_1, type2score_2=type2score_2, reference_value=reference_value, handle_missing_scores=handle_missing_scores, stop_lens=stop_lens, stop_words=stop_words, normalization=normalization, )
[docs]class ProportionShift(Shift): """ Shift object for calculating differences in proportions of types across two systems Parameters __________ type2freq_1, type2freq_2: dict Keys are types of a system and values are frequencies of those types """ def __init__(self, type2freq_1, type2freq_2): # Set relative frequency to 0 for types that don't appear type2freq_1 = type2freq_1.copy() type2freq_2 = type2freq_2.copy() types = set(type2freq_1.keys()).union(type2freq_2.keys()) for t in types: if t not in type2freq_1: type2freq_1[t] = 0 elif t not in type2freq_2: type2freq_2[t] = 0 # Initialize shift object super().__init__( type2freq_1=type2freq_1, type2freq_2=type2freq_2, type2score_1=None, type2score_2=None, reference_value=0, handle_missing_scores="error", stop_lens=None, stop_words=None, normalization="variation", )
[docs] def get_shift_graph( self, top_n=50, show_plot=True, detailed=False, text_size_inset=True, cumulative_inset=True, title=None, filename=None, **kwargs ): if title is None: title = "" ax = super().get_shift_graph( top_n=top_n, text_size_inset=text_size_inset, cumulative_inset=cumulative_inset, detailed=detailed, show_plot=show_plot, filename=filename, show_total=False, title=title, **kwargs ) return ax
[docs]class EntropyShift(Shift): """ Shift object for calculating the shift in entropy between two systems Parameters ---------- type2freq_1, type2freq_2: dict Keys are types of a system and values are frequencies of those types base: float, optional Base of the logarithm for calculating entropy alpha: float, optional The parameter for the generalized Tsallis entropy. Setting `alpha=1` recovers the Shannon entropy. Higher `alpha` emphasizes more common types, lower `alpha` emphasizes less common types For details: https://en.wikipedia.org/wiki/Tsallis_entropy reference_value: str or float, optional The reference score to use to partition scores into two different regimes. If 'average', uses the average score according to type2freq_1 and type2score_1. Otherwise, uses zero as the reference point normalization: str, optional If 'variation', normalizes shift scores so that the sum of their absolute values sums to 1. If 'trajectory', normalizes them so that the sum of shift scores is 1 or -1. The trajectory normalization cannot be applied if the total shift score is 0, so scores are left unnormalized if the total is 0 and 'trajectory' is specified """ def __init__( self, type2freq_1, type2freq_2, base=2, alpha=1, reference_value=0, normalization="variation", ): # Get relative frequencies type2freq_1 = type2freq_1.copy() type2freq_2 = type2freq_2.copy() type2p_1 = entropy.get_relative_freqs(type2freq_1) type2p_2 = entropy.get_relative_freqs(type2freq_2) # Get entropy scores type2s_1, type2s_2 = entropy.get_entropy_scores(type2p_1, type2p_2, base, alpha) # Initialize shift super().__init__( type2freq_1=type2freq_1, type2freq_2=type2freq_2, type2score_1=type2s_1, type2score_2=type2s_2, handle_missing_scores="error", stop_lens=None, stop_words=None, reference_value=reference_value, normalization=normalization, ) self.type2p_1 = type2p_1 self.type2p_2 = type2p_2 self.alpha = alpha
[docs] def get_shift_graph( self, top_n=50, show_plot=True, detailed=False, text_size_inset=True, cumulative_inset=True, filename=None, **kwargs ): ax = super().get_shift_graph( top_n=top_n, text_size_inset=text_size_inset, cumulative_inset=cumulative_inset, detailed=detailed, show_plot=show_plot, filename=filename, **kwargs ) return ax
[docs]class KLDivergenceShift(Shift): """ Shift object for calculating the Kullback-Leibler divergence (KLD) between two systems Parameters ---------- type2freq_1, type2freq_2: dict Keys are types of a system and values are frequencies of those types. The KLD will be computed with respect type2freq_1, i.e. D(T2 || T1). For the KLD to be well defined, all types must have nonzero frequencies in both type2freq_1 and type2_freq2 base: float, optional Base of the logarithm for calculating entropy stop_lens: iterable of 2-tuples, optional Denotes intervals that should be excluded when calculating shift scores normalization: str, optional If 'variation', normalizes shift scores so that the sum of their absolute values sums to 1. If 'trajectory', normalizes them so that the sum of shift scores is 1 or -1. The trajectory normalization cannot be applied if the total shift score is 0, so scores are left unnormalized if the total is 0 and 'trajectory' is specified """ def __init__( self, type2freq_1, type2freq_2, base=2, reference_value=0, normalization="variation", ): # Check that KLD is well defined types_1 = set(type2freq_1.keys()) types_2 = set(type2freq_2.keys()) if len(types_2.difference(types_1)) > 0: err = ( "There are types that appear in type2freq_2 but not type2freq_1:" + "the KL divergence is not well-defined" ) raise ValueError(err) # Get relative frequencies type2freq_1 = type2freq_1.copy() type2freq_2 = type2freq_2.copy() type2p_1 = entropy.get_relative_freqs(type2freq_1) type2p_2 = entropy.get_relative_freqs(type2freq_2) # Get surprisal scores type2s_1, type2s_2 = entropy.get_entropy_scores(type2p_1, type2p_2, base, alpha=1) # Initialize shift super().__init__( type2freq_1=type2freq_2, type2freq_2=type2freq_2, type2score_1=type2s_2, type2score_2=type2s_1, handle_missing_scores="error", stop_lens=None, stop_words=None, reference_value=reference_value, normalization=normalization, ) self.type2p_1 = type2p_1 self.type2p_2 = type2p_2
[docs] def get_shift_graph( self, top_n=50, show_plot=True, detailed=False, text_size_inset=True, cumulative_inset=True, title=None, filename=None, **kwargs ): if title is None: title = "" ax = super().get_shift_graph( top_n=top_n, text_size_inset=text_size_inset, cumulative_inset=cumulative_inset, detailed=detailed, show_plot=show_plot, title=title, filename=filename, **kwargs ) return ax
[docs]class JSDivergenceShift(Shift): """ Shift object for calculating the Jensen-Shannon divergence (JSD) between two systems Parameters ---------- type2freq_1, type2freq_2: dict Keys are types of a system and values are frequencies of those types weight_1, weight_2: float Relative weights of type2freq_1 and type2frq_2 when constructing their mixed distribution. Should sum to 1 base: float, optional Base of the logarithm for calculating entropy alpha: float, optional The parameter for the generalized Tsallis entropy. Setting `alpha=1` recovers the Shannon entropy. Higher `alpha` emphasizes more common types, lower `alpha` emphasizes less common types For details: https://en.wikipedia.org/wiki/Tsallis_entropy reference_value: str or float, optional The reference score to use to partition scores into two different regimes. Defaults to zero as the reference point normalization: str, optional If 'variation', normalizes shift scores so that the sum of their absolute values sums to 1. If 'trajectory', normalizes them so that the sum of shift scores is 1 or -1. The trajectory normalization cannot be applied if the total shift score is 0, so scores are left unnormalized if the total is 0 and 'trajectory' is specified """ def __init__( self, type2freq_1, type2freq_2, base=2, weight_1=0.5, weight_2=0.5, alpha=1, reference_value=0, normalization="variation", ): # Check weights if weight_1 + weight_2 != 1: raise ValueError("weight_1 and weight_2 do not sum to 1") # Get relative frequencies type2freq_1 = type2freq_1.copy() type2freq_2 = type2freq_2.copy() type2p_1 = entropy.get_relative_freqs(type2freq_1) type2p_2 = entropy.get_relative_freqs(type2freq_2) # Get shift scores type2m, type2s_1, type2s_2 = entropy.get_jsd_scores( type2p_1, type2p_2, weight_1=weight_1, weight_2=weight_2, base=base, alpha=alpha, ) # Initialize shift object super().__init__( type2freq_1=type2freq_1, type2freq_2=type2freq_2, type2score_1=type2s_1, type2score_2=type2s_2, reference_value=reference_value, handle_missing_scores="error", normalization=normalization, stop_lens=None, stop_words=None, ) self.type2p_1 = type2p_1 self.type2p_2 = type2p_2 self.type2m = type2m self.alpha = alpha
[docs] def get_shift_graph( self, top_n=50, show_plot=True, detailed=False, text_size_inset=True, cumulative_inset=True, title=None, filename=None, **kwargs ): if self.alpha == 1 and self.reference_value == 0: all_pos_contributions = True else: all_pos_contributions = False if title is None: title = "" ax = super().get_shift_graph( top_n=top_n, text_size_inset=text_size_inset, cumulative_inset=cumulative_inset, detailed=detailed, show_plot=show_plot, filename=filename, title=title, all_pos_contributions=all_pos_contributions, **kwargs ) return ax