Source code for shifterator.shifterator

import sys
import warnings

import matplotlib.pyplot as plt

from . import helper, plotting


class Shift:
    """
    Shift object for calculating weighted scores of two systems of types,
    and the shift between them

    Parameters
    ----------
    type2freq_1, type2freq_2: dict
        Keys are types of a system and values are frequencies of those types
    type2score_1, type2score_2: dict or str, optional
        If dict, types are keys and values are scores associated with each
        type. If str, the name of a score lexicon included in Shifterator.
        If None and other type2score is None, defaults to uniform scores
        across types. Otherwise defaults to the other type2score dict
    reference_value: str or float, optional
        The reference score to use to partition scores into two different
        regimes. If 'average', uses the average score according to type2freq_1
        and type2score_1. If None and a lexicon is selected for type2score,
        uses the respective middle point in that lexicon's scale. Otherwise
        if None, uses zero as the reference point
    handle_missing_scores: str, optional
        If 'error', throws an error whenever a word has a score in one score
        dictionary but not the other. If 'exclude', excludes any word that is
        missing a score in one score dictionary from all word shift
        calculations, regardless if it may have a score in the other dictionary.
        If 'adopt' and the score is missing in one dictionary, then uses the
        score from the other dictionary if it is available
    stop_lens: iterable of 2-tuples, optional
        Denotes intervals of scores that should be excluded from word shifts
        calculations. Types with scores in this range will be excluded from word
        shift calculations
    stop_words: set, optional
        Denotes words that should be excluded from calculation of word shifts
    normalization: str, optional
        If 'variation', normalizes shift scores so that the sum of
        their absolute values sums to 1. If 'trajectory', normalizes
        them so that the sum of shift scores is 1 or -1. A normalization
        whose denominator is 0 cannot be applied (all shift scores are 0 for
        'variation', or the total shift score is 0 for 'trajectory'); in that
        case a warning is issued and the scores are left unnormalized. Any
        other value also leaves the scores unnormalized
    """

    def __init__(
        self,
        type2freq_1,
        type2freq_2,
        type2score_1=None,
        type2score_2=None,
        reference_value=None,
        handle_missing_scores="error",
        stop_lens=None,
        stop_words=None,
        normalization="variation",
    ):
        # Set type2score dictionaries. `lex_ref` is the reference value that
        # helper.get_score_dictionary returns alongside the scores; it is only
        # used below when no explicit reference_value is given
        if type2score_1 is not None and type2score_2 is not None:
            self.type2score_1, lex_ref = helper.get_score_dictionary(type2score_1)
            self.type2score_2, _ = helper.get_score_dictionary(type2score_2)
            # Score differences are only shown when the two inputs differ
            self.show_score_diffs = bool(type2score_1 != type2score_2)
        elif type2score_1 is not None:
            self.type2score_1, lex_ref = helper.get_score_dictionary(type2score_1)
            self.type2score_2 = self.type2score_1
            self.show_score_diffs = False
        elif type2score_2 is not None:
            self.type2score_2, lex_ref = helper.get_score_dictionary(type2score_2)
            self.type2score_1 = self.type2score_2
            self.show_score_diffs = False
        else:
            # No scores given at all: every type gets a uniform score of 1
            lex_ref = None
            self.type2score_1 = {t: 1 for t in type2freq_1}
            self.type2score_2 = {t: 1 for t in type2freq_2}
            self.show_score_diffs = False

        # Preprocess words according to score rules, stop words, and stop lens
        self.handle_missing_scores = handle_missing_scores
        self.stop_lens = [] if stop_lens is None else stop_lens
        self.stop_words = set() if stop_words is None else stop_words
        preprocessed = helper.preprocess_words_scores(
            type2freq_1,
            self.type2score_1,
            type2freq_2,
            self.type2score_2,
            self.stop_lens,
            self.stop_words,
            self.handle_missing_scores,
        )
        # The helper returns the filtered frequencies and scores followed by
        # four collections of types, stored under the names below
        self.type2freq_1 = preprocessed[0]
        self.type2freq_2 = preprocessed[1]
        self.type2score_1 = preprocessed[2]
        self.type2score_2 = preprocessed[3]
        self.types = preprocessed[4]
        self.filtered_types = preprocessed[5]
        self.no_score_types = preprocessed[6]
        self.adopted_score_types = preprocessed[7]

        # Set reference value
        if reference_value is None:
            # Fall back to the lexicon's reference if there is one, else zero
            self.reference_value = lex_ref if lex_ref is not None else 0
        elif reference_value == "average":
            self.reference_value = self.get_weighted_score(
                self.type2freq_1, self.type2score_1
            )
        else:
            self.reference_value = reference_value

        # Get shift scores
        self.normalization = normalization
        self.get_shift_scores(details=False)

    def get_weighted_score(self, type2freq, type2score):
        """
        Calculate an average score according to a set of frequencies and scores

        Only types present in both dictionaries contribute to the average.

        Parameters
        ----------
        type2freq: dict
            Keys are types and values are frequencies
        type2score: dict
            Keys are types and values are scores

        Returns
        -------
        s_avg: float or None
            Average weighted score of system. None if the two dictionaries
            share no types, or if the shared types have a total frequency of
            zero (the average is undefined in both cases)
        """
        # Check we have a vocabulary to work with
        types = set(type2freq).intersection(type2score)
        if not types:
            return None
        # Get weighted score and total frequency
        f_total = sum(freq for t, freq in type2freq.items() if t in types)
        if f_total == 0:
            # Avoid a ZeroDivisionError when every shared type has frequency 0
            return None
        s_weighted = sum(
            type2score[t] * freq for t, freq in type2freq.items() if t in types
        )
        return s_weighted / f_total

    def get_shift_scores(self, details=False):
        """
        Calculates the type shift scores between the two systems

        The results are also stored on the object as `type2p_diff`,
        `type2s_diff`, `type2p_avg`, `type2s_ref_diff`, `type2shift_score`,
        `diff` (the unnormalized total shift) and `norm` (the normalization
        constant that was applied).

        Parameters
        ----------
        details: boolean
            If true, returns each of the major components of each type's shift
            score, along with the overall shift scores. Otherwise, only returns
            the overall shift scores

        Returns
        -------
        type2p_diff: dict
            If details is True, returns dict where keys are types and values are
            the difference in relative frequency, i.e. p_i,2 - p_i,1 for type i
        type2s_diff: dict,
            If details is True, returns dict where keys are types and values are
            the relative differences in score, i.e. s_i,2 - s_i,1 for type i
        type2p_avg: dict,
            If details is True, returns dict where keys are types and values are
            the average relative frequencies, i.e. 0.5*(p_i,1+p_i,2) for type i
        type2s_ref_diff: dict
            If details is True, returns dict where keys are types and values are
            relative deviation from reference score, i.e. 0.5*(s_i,2+s_i,1)-s_ref
            for type i
        type2shift_score: dict
            Keys are types and values are shift scores. The overall shift scores
            are normalized according to the `normalization` parameter of the
            Shift object
        """
        s_avg_ref = self.reference_value

        # Get total frequencies
        total_freq_1 = sum(
            freq for t, freq in self.type2freq_1.items() if t in self.types
        )
        total_freq_2 = sum(
            freq for t, freq in self.type2freq_2.items() if t in self.types
        )

        def relative_freq(type2freq, t, total):
            # A type missing from a system, or a system with no frequency
            # mass at all, contributes a relative frequency of zero
            if total == 0 or t not in type2freq:
                return 0
            return type2freq[t] / total

        # Get relative frequency of types in both systems
        type2p_1 = {
            t: relative_freq(self.type2freq_1, t, total_freq_1) for t in self.types
        }
        type2p_2 = {
            t: relative_freq(self.type2freq_2, t, total_freq_2) for t in self.types
        }

        # Calculate shift components
        type2p_avg = dict()
        type2p_diff = dict()
        type2s_diff = dict()
        type2s_ref_diff = dict()
        type2shift_score = dict()
        for t in self.types:
            type2p_avg[t] = 0.5 * (type2p_1[t] + type2p_2[t])
            type2p_diff[t] = type2p_2[t] - type2p_1[t]
            type2s_diff[t] = self.type2score_2[t] - self.type2score_1[t]
            type2s_ref_diff[t] = (
                0.5 * (self.type2score_2[t] + self.type2score_1[t]) - s_avg_ref
            )
            type2shift_score[t] = (
                type2p_diff[t] * type2s_ref_diff[t] + type2s_diff[t] * type2p_avg[t]
            )

        # Normalize the total shift scores
        total_diff = sum(type2shift_score.values())
        self.diff = total_diff
        if self.normalization == "variation":
            norm = sum(abs(s) for s in type2shift_score.values())
        elif self.normalization == "trajectory":
            norm = abs(total_diff)
        else:
            norm = 1
        if norm == 0:
            # The selected normalization has a zero denominator. Note that a
            # zero *total* alone does not prevent 'variation' normalization:
            # positive and negative shifts may cancel while the sum of their
            # absolute values is still non-zero
            warnings.warn("Score normalization is not well-defined because the total score diff is 0. Setting norm to 1")
            norm = 1
        self.norm = norm
        type2shift_score = {
            t: shift_score / self.norm for t, shift_score in type2shift_score.items()
        }

        # Set results in shift object
        self.type2p_diff = type2p_diff
        self.type2s_diff = type2s_diff
        self.type2p_avg = type2p_avg
        self.type2s_ref_diff = type2s_ref_diff
        self.type2shift_score = type2shift_score
        # Return shift scores
        if details:
            return (
                type2p_diff,
                type2s_diff,
                type2p_avg,
                type2s_ref_diff,
                type2shift_score,
            )
        return type2shift_score

    def get_shift_component_sums(self):
        """
        Calculates the cumulative contribution of each component of the different
        kinds of shift scores.

        The sums are taken over the unnormalized components, i.e. they are not
        divided by `norm`.

        Returns
        -------
        Dictionary with six keys, one for each of the different component
        contributions: pos_s_pos_p, pos_s_neg_p, neg_s_pos_p, neg_s_neg_p,
        pos_s, neg_s. Values are the total contribution from that component
        across all types
        """
        # Make sure the per-type components exist; get_shift_scores stores
        # them on the object, so its return value is not needed here
        if getattr(self, "type2shift_score", None) is None:
            self.get_shift_scores(details=False)

        # Sum up components of shift score
        pos_s_pos_p = 0
        pos_s_neg_p = 0
        neg_s_pos_p = 0
        neg_s_neg_p = 0
        pos_s = 0
        neg_s = 0
        for t in self.type2s_diff:
            p_diff = self.type2p_diff[t]
            s_diff = self.type2s_diff[t]
            p_avg = self.type2p_avg[t]
            s_ref_diff = self.type2s_ref_diff[t]
            # Get contribution of p_diff*s_ref_diff term
            if s_ref_diff > 0:
                if p_diff > 0:
                    pos_s_pos_p += p_diff * s_ref_diff
                else:
                    pos_s_neg_p += p_diff * s_ref_diff
            else:
                if p_diff > 0:
                    neg_s_pos_p += p_diff * s_ref_diff
                else:
                    neg_s_neg_p += p_diff * s_ref_diff
            # Get contribution of s_diff term
            if s_diff > 0:
                pos_s += p_avg * s_diff
            else:
                neg_s += p_avg * s_diff
        return {
            "pos_s_pos_p": pos_s_pos_p,
            "pos_s_neg_p": pos_s_neg_p,
            "neg_s_pos_p": neg_s_pos_p,
            "neg_s_neg_p": neg_s_neg_p,
            "pos_s": pos_s,
            "neg_s": neg_s,
        }

    def get_shift_graph(
        self,
        ax=None,
        top_n=50,
        text_size_inset=True,
        cumulative_inset=True,
        show_plot=True,
        filename=None,
        **kwargs
    ):
        """
        Plot the shift graph between two systems of types

        Parameters
        ----------
        ax: matplotlib.axes.Axes, optional
            Axes to draw figure onto. Will create new axes if none are given.
        top_n: int, optional
            Display the top_n types as sorted by their absolute contribution to
            the difference between systems
        cumulative_inset: bool, optional
            Whether to show an inset showing the cumulative contributions to the
            shift by ranked types
        text_size_inset: bool, optional
            Whether to show an inset showing the relative sizes of each system
        show_plot: bool, optional
            Whether to show plot when it is done being rendered
        filename: str, optional
            If not None, name of the file for saving the shift graph
        **kwargs
            Plotting parameters. They are passed through
            `plotting.get_plot_params`, and the resulting dict is what the
            rest of this method reads (e.g. 'width', 'height', 'title',
            'system_names', 'dpi'). If the resulting dict has no 'title', a
            title showing the average score of each system is generated

        Returns
        -------
        ax
            Matplotlib ax of shift graph. Displays shift graph if show_plot=True
        """
        # Set plotting parameters
        kwargs = plotting.get_plot_params(kwargs, self.show_score_diffs, self.diff)

        # Get type score components
        type_scores = [
            (
                t,
                self.type2p_diff[t],
                self.type2s_diff[t],
                self.type2p_avg[t],
                self.type2s_ref_diff[t],
                self.type2shift_score[t],
            )
            for t in self.type2s_diff
        ]
        # Reverse sorting to get highest scores, then reverse top n for plotting
        type_scores = sorted(type_scores, key=lambda x: abs(x[-1]), reverse=True)[
            :top_n
        ]
        type_scores.reverse()

        # Get bar heights and colors
        bar_dims = plotting.get_bar_dims(type_scores, self.norm, kwargs)
        bar_colors = plotting.get_bar_colors(type_scores, kwargs)

        # Initialize plot
        if ax is None:
            _, ax = plt.subplots(figsize=(kwargs["width"], kwargs["height"]))
        ax.margins(kwargs["y_margin"])
        # Plot type contributions
        ax = plotting.plot_contributions(ax, top_n, bar_dims, bar_colors, kwargs)
        # Plot total sum contributions
        total_comp_sums = self.get_shift_component_sums()
        bar_order = plotting.get_bar_order(kwargs)
        ax, comp_bar_heights, bar_order = plotting.plot_total_contribution_sums(
            ax, total_comp_sums, bar_order, top_n, bar_dims, kwargs
        )
        # Get labels for bars
        type_labels = [scores[0] for scores in type_scores]
        # Add indicator if type borrowed a score
        m_sym = kwargs["missing_symbol"]
        type_labels = [
            t + m_sym if t in self.adopted_score_types else t for t in type_labels
        ]
        # Get labels for total contribution bars
        bar_labels = [kwargs["symbols"][b] for b in bar_order]
        labels = type_labels + bar_labels
        # Set font type
        if kwargs["serif"]:
            plotting.set_serif()
        # Set labels; the detailed view uses its own set of label heights
        heights_key = "label_heights" if kwargs["detailed"] else "total_heights"
        ax = plotting.set_bar_labels(
            ax, top_n, labels, bar_dims[heights_key], comp_bar_heights, kwargs,
        )

        # Add center dividing line
        ax.axvline(0, ls="-", color="black", lw=1.0, zorder=20)

        # Add dividing line between types and component bars
        ax.axhline(top_n + 1, ls="-", color="black", lw=0.7, zorder=20)
        if kwargs["show_total"]:
            ax.axhline(top_n + 2.75, ls="-", color="black", lw=0.5, zorder=20)

        # Set insets
        if cumulative_inset:
            plotting.get_cumulative_inset(
                ax.figure, self.type2shift_score, top_n, self.normalization, kwargs
            )
        if text_size_inset:
            plotting.get_text_size_inset(
                ax.figure, self.type2freq_1, self.type2freq_2, kwargs
            )

        # Make x-tick labels bigger, flip y-axis ticks and label every 5th one
        ax = plotting.set_ticks(ax, top_n, kwargs)

        # Set axis spines
        ax = plotting.set_spines(ax, kwargs)

        # Set axis labels and title
        ax.set_xlabel(kwargs["xlabel"], fontsize=kwargs["xlabel_fontsize"])
        ax.set_ylabel(kwargs["ylabel"], fontsize=kwargs["ylabel_fontsize"])
        if "title" not in kwargs:
            s_avg_1 = self.get_weighted_score(self.type2freq_1, self.type2score_1)
            s_avg_2 = self.get_weighted_score(self.type2freq_2, self.type2score_2)

            def format_avg(s_avg):
                # get_weighted_score returns None when the average is
                # undefined, and None cannot be formatted as a float
                return "n/a" if s_avg is None else "{0:.2f}".format(s_avg)

            title = (
                "{}: ".format(kwargs["system_names"][0])
                + r"$\Phi_{avg}=$"
                + format_avg(s_avg_1)
                + "\n"
                + "{}: ".format(kwargs["system_names"][1])
                + r"$\Phi_{avg}=$"
                + format_avg(s_avg_2)
            )
            kwargs["title"] = title
        ax.set_title(kwargs["title"], fontsize=kwargs["title_fontsize"])
        # Show and return plot
        if kwargs["tight"]:
            # tight_layout may emit warnings for the layout drawn here;
            # they are deliberately silenced
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                plt.tight_layout()
        if filename is not None:
            plt.savefig(filename, dpi=kwargs["dpi"])
        if show_plot:
            plt.show()
        return ax
Source code for shifterator.shifterator

Shifterator

Navigation

Related Topics