Source code for tau_eval.tasks.imdb_authorship_classification

import tasknet as tn
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset


[docs]
class IMDBAuthorshipClassification(tn.Classification):
    """
    A classification task for authorship attribution using the IMDb-62 dataset.

    Inherits from tasknet.Classification. On construction, the ``train`` split
    of ``tasksource/imdb62`` is loaded, ``n_authors`` authors are sampled (with
    a seeded RNG) among those having at least ``min_docs_per_author``
    documents, and the resulting rows are exposed as a ``text`` / ``labels``
    dataset.
    """

    def __init__(self, n_authors: int = 10, min_docs_per_author: int = 1000,
                 random_seed: int = 0, **kwargs):
        """
        Initialize the authorship classification task.

        Args:
            n_authors: Number of authors to include in the classification task
            min_docs_per_author: Minimum number of documents required per author
            random_seed: Random seed for reproducible author selection
            **kwargs: Additional arguments passed to parent Classification class

        Raises:
            ValueError: If fewer than ``n_authors`` authors have at least
                ``min_docs_per_author`` documents.
        """
        # These must be set before the dataset is built: the helpers below
        # read them from ``self``.
        self.n_authors = n_authors
        self.min_docs_per_author = min_docs_per_author
        self.random_seed = random_seed

        # Load and process the dataset
        processed_data = self._load_and_process_dataset()

        # Initialize parent class with processed data
        super().__init__(
            dataset=tn.utils.train_validation_test_split(processed_data),
            s1="text",
            y="labels",
            **kwargs
        )

        # Set task name (assigned after the parent initializer, so it takes
        # precedence over whatever the parent stored)
        self.name = f"imdb_authorship_{n_authors}_authors"

    @staticmethod
    def _load_raw_dataframe() -> pd.DataFrame:
        """Load the ``train`` split of ``tasksource/imdb62`` as a DataFrame."""
        dataset = load_dataset("tasksource/imdb62")
        return pd.DataFrame(dataset["train"])

    def _select_authors(self, author_counts: pd.Series) -> np.ndarray:
        """
        Pick ``n_authors`` authors among those with enough documents.

        This is the single source of truth for author selection; both the
        dataset construction and ``author_info`` go through it so they can
        never disagree.

        Args:
            author_counts: Series indexed by author id whose values are the
                number of documents of that author.

        Returns:
            Array holding the ids of the selected authors.

        Raises:
            ValueError: If fewer than ``n_authors`` authors are eligible.
        """
        eligible_authors = author_counts[author_counts >= self.min_docs_per_author].index

        if len(eligible_authors) < self.n_authors:
            raise ValueError(
                f"Only {len(eligible_authors)} authors have >= {self.min_docs_per_author} "
                f"documents, but {self.n_authors} authors requested."
            )

        # Sorting first makes the draw depend only on the seed and on the set
        # of eligible authors, not on the order value_counts() returned them.
        rng = np.random.default_rng(self.random_seed)
        return rng.choice(
            np.sort(eligible_authors),
            size=self.n_authors,
            replace=False
        )

    def _load_and_process_dataset(self) -> Dataset:
        """
        Load IMDb-62 dataset and process it for authorship classification.

        Returns:
            Dataset: Rows of the selected authors with a ``text`` column (the
            review content) and a class-encoded ``labels`` column (the author).

        Raises:
            ValueError: If fewer than ``n_authors`` authors are eligible.
        """
        df = self._load_raw_dataframe()
        selected_authors = self._select_authors(df["userId"].value_counts())

        # Filter dataset to selected authors.
        # NOTE: dropna() is called without a subset, so a row is discarded if
        # *any* of its columns is missing, not only userId/content.
        filtered_df = df[df["userId"].isin(selected_authors)].dropna()

        # Hand from_dict a plain mapping of column name -> list of values.
        data = Dataset.from_dict({
            "labels": filtered_df["userId"].tolist(),
            "text": filtered_df["content"].tolist(),
        })

        # Turn the author ids into a ClassLabel feature.
        return data.class_encode_column("labels")

    @property
    def author_info(self) -> dict:
        """
        Get information about selected authors and their document counts.

        The dataset is reloaded and the selection is recomputed from the
        current ``n_authors``, ``min_docs_per_author`` and ``random_seed`` on
        every access. Counts are taken from the raw split, i.e. before rows
        with missing values are dropped for the task dataset.

        Returns:
            dict: ``selected_authors`` (list of author ids),
            ``author_doc_counts`` (author id -> number of documents) and
            ``total_documents`` (sum of those counts). All values are native
            Python objects.

        Raises:
            ValueError: If fewer than ``n_authors`` authors are eligible.
        """
        author_counts = self._load_raw_dataframe()["userId"].value_counts()

        # tolist() yields native Python ids; they are reused as dict keys so
        # the list and the mapping share exactly the same key objects/types.
        selected_authors = self._select_authors(author_counts).tolist()
        author_doc_counts = {
            author: int(author_counts[author]) for author in selected_authors
        }

        return {
            "selected_authors": selected_authors,
            "author_doc_counts": author_doc_counts,
            "total_documents": sum(author_doc_counts.values())
        }