import tasknet as tn
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
# (stray "[docs]" Sphinx source-link marker from the rendered documentation; not part of the module)
class IMDBAuthorshipClassification(tn.Classification):
    """
    A classification task for authorship attribution using the IMDb-62 dataset.

    Inherits from tasknet.Classification. On construction it loads the
    "tasksource/imdb62" dataset, keeps only authors (``userId`` values) that
    have at least ``min_docs_per_author`` rows, draws ``n_authors`` of them
    with a seeded random generator, and hands the resulting
    ``text`` / ``labels`` dataset to the parent class.
    """

    # Hub identifier of the dataset and the split that is read from it.
    _DATASET_ID = "tasksource/imdb62"
    _SPLIT = "train"

    def __init__(self, n_authors: int = 10, min_docs_per_author: int = 1000,
                 random_seed: int = 0, **kwargs):
        """
        Initialize the authorship classification task.

        Args:
            n_authors: Number of authors to include in the classification task
            min_docs_per_author: Minimum number of documents required per author
            random_seed: Random seed for reproducible author selection
            **kwargs: Additional arguments passed to parent Classification class

        Raises:
            ValueError: If fewer than ``n_authors`` authors have at least
                ``min_docs_per_author`` documents.
        """
        self.n_authors = n_authors
        self.min_docs_per_author = min_docs_per_author
        self.random_seed = random_seed

        # Load and process the dataset (also caches the author summary that
        # the `author_info` property returns).
        processed_data = self._load_and_process_dataset()

        # Initialize parent class with processed data
        super().__init__(
            dataset=tn.utils.train_validation_test_split(processed_data),
            s1="text",
            y="labels",
            **kwargs
        )

        # Set task name (assigned after the parent initializer has run, as in
        # the original implementation).
        self.name = f"imdb_authorship_{n_authors}_authors"

    def _selection_key(self) -> tuple:
        """Return the parameters that fully determine the author selection."""
        return (self.n_authors, self.min_docs_per_author, self.random_seed)

    def _load_raw_frame(self) -> pd.DataFrame:
        """Load the IMDb-62 train split and return it as a pandas DataFrame."""
        dataset = load_dataset(self._DATASET_ID)
        return pd.DataFrame(dataset[self._SPLIT])

    def _select_authors(self, author_counts: pd.Series) -> np.ndarray:
        """
        Pick ``n_authors`` authors among those with enough documents.

        Args:
            author_counts: Documents per author, indexed by ``userId``
                (the result of ``df["userId"].value_counts()``).

        Returns:
            Array with the selected ``userId`` values.

        Raises:
            ValueError: If fewer than ``n_authors`` authors are eligible.
        """
        eligible_authors = author_counts[
            author_counts >= self.min_docs_per_author
        ].index
        if len(eligible_authors) < self.n_authors:
            raise ValueError(
                f"Only {len(eligible_authors)} authors have >= {self.min_docs_per_author} "
                f"documents, but {self.n_authors} authors requested."
            )

        # Sort the candidates first so the draw depends only on the set of
        # eligible authors and the seed, not on the order in which
        # value_counts() happens to return them.
        rng = np.random.default_rng(self.random_seed)
        return rng.choice(
            np.sort(eligible_authors),
            size=self.n_authors,
            replace=False,
        )

    @staticmethod
    def _summarize_authors(selected_authors: np.ndarray,
                           author_counts: pd.Series) -> dict:
        """
        Build the summary dictionary exposed by `author_info`.

        Values are converted to native Python types (via ``tolist()`` and
        ``int()``) so that keys, counts and the total are consistent with the
        ``selected_authors`` list and can be serialized, e.g. to JSON.
        """
        selected = selected_authors.tolist()
        doc_counts = {
            author: int(author_counts.loc[author]) for author in selected
        }
        return {
            "selected_authors": selected,
            "author_doc_counts": doc_counts,
            "total_documents": sum(doc_counts.values()),
        }

    def _load_and_process_dataset(self) -> Dataset:
        """
        Load IMDb-62 dataset and process it for authorship classification.

        Returns:
            Dataset: Dataset with a ``text`` column (the review ``content``)
            and a class-encoded ``labels`` column (the ``userId``), restricted
            to the selected authors.

        Raises:
            ValueError: If fewer than ``n_authors`` authors are eligible.
        """
        df = self._load_raw_frame()

        # Get author document counts and select the authors to keep
        author_counts = df["userId"].value_counts()
        selected_authors = self._select_authors(author_counts)

        # Remember the summary so `author_info` does not have to reload the
        # dataset; the key ties the cache to the parameters it was built from.
        self._author_info_cache = (
            self._selection_key(),
            self._summarize_authors(selected_authors, author_counts),
        )

        # Filter dataset to selected authors. Note: dropna() discards rows
        # with a missing value in *any* column, not only the two kept below.
        filtered_df = df[df["userId"].isin(selected_authors)].dropna()

        # Format for classification task; Dataset.from_dict takes a mapping
        # of column name -> values, so pass a plain dict of lists.
        data = Dataset.from_dict({
            "labels": filtered_df["userId"].tolist(),
            "text": filtered_df["content"].tolist(),
        })

        # Encode labels as class ids
        return data.class_encode_column("labels")

    @property
    def author_info(self) -> dict:
        """
        Get information about selected authors and their document counts.

        Returns:
            dict with keys:
                ``selected_authors``: list of the selected ``userId`` values,
                ``author_doc_counts``: mapping ``userId`` -> number of rows
                    for that author in the raw dataset (counted before the
                    ``dropna()`` applied when building the task dataset),
                ``total_documents``: sum of those counts.

        Raises:
            ValueError: If the summary has to be recomputed and fewer than
                ``n_authors`` authors are eligible.
        """
        key = self._selection_key()
        cached_key, info = getattr(self, "_author_info_cache", (None, None))
        if info is None or cached_key != key:
            # No summary yet, or the selection parameters were changed after
            # construction: recompute from the raw data, as the property
            # originally did on every access.
            author_counts = self._load_raw_frame()["userId"].value_counts()
            info = self._summarize_authors(
                self._select_authors(author_counts), author_counts
            )
            self._author_info_cache = (key, info)

        # Return fresh containers so callers cannot mutate the cached summary.
        return {
            "selected_authors": list(info["selected_authors"]),
            "author_doc_counts": dict(info["author_doc_counts"]),
            "total_documents": info["total_documents"],
        }