In [1]:
# ------------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------------
import pandas as pd # === CORE EDA ===
import numpy as np
import matplotlib.pyplot as plt # === VISUALIZATION ===
import seaborn as sns
import plotly.express as px
import textwrap # === TEXT / Light NLP ===
import re
from collections import Counter
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer # === MACHINE LEARNING ===
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from tabulate import tabulate # === ENHANCED DISPLAY ===
from rich import print as rprint
plt.style.use("default") # === DISPLAY SETTINGS === start from Matplotlib's "default" style sheet
sns.set_theme() # then apply seaborn's default theme; kept after style.use() so the seaborn settings are the ones in effect
In [2]:
# ------------------------------------------------------------------
# 2. Convert Markdown (.md) to clean CSV
# ------------------------------------------------------------------
md_file = '../../data/raw/dark_stage_raw_dataset.md'
clean_csv_file = '../../data/raw/dark_stage_clean.csv'

# Parse the Markdown table as a pipe-delimited file; file row 1 is the
# |---|---| separator under the header, so it is skipped.
df = pd.read_csv(md_file, sep='|', skiprows=[1], engine='python')

# Ghost columns: the "Unnamed" ones, plus any column that is entirely empty.
is_unnamed = df.columns.str.contains("^Unnamed")
df = df.loc[:, ~is_unnamed].dropna(axis=1, how='all')

# Trim padding from the header names, then from every text cell.
df.columns = df.columns.str.strip()
text_cols = df.select_dtypes(include="object").columns
df[text_cols] = df[text_cols].apply(lambda series: series.str.strip())

# 'style' is dropped when present (original note: every row is 'blank-verse').
df = df.drop(columns=["style"], errors="ignore")
df.to_csv(clean_csv_file, index=False)
# ------------------------------------------------------------------
# 3. Load cleaned CSV
# ------------------------------------------------------------------
df_dark_stage = pd.read_csv(clean_csv_file)
In [3]:
# ------------------------------------------------------------------
# 4. Quick overview
# ------------------------------------------------------------------
# Dataset dimensions as (rows, columns).
n_rows, n_cols = df_dark_stage.shape
print((n_rows, n_cols))
(80, 15)
In [4]:
# Column labels of the analysis frame.
column_index = df_dark_stage.columns
print(column_index)
Index(['period', 'author_id', 'influence', 'death_year', 'incident_type',
'anecdote', 'feud_with', 'notable_rivalry', 'intensity', 'sentiment',
'stage_mood', 'location', 'play_id', 'creature_id', 'notes'],
dtype='object')
In [5]:
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df_dark_stage.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 80 entries, 0 to 79 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 period 80 non-null object 1 author_id 80 non-null object 2 influence 80 non-null object 3 death_year 80 non-null int64 4 incident_type 80 non-null object 5 anecdote 80 non-null object 6 feud_with 77 non-null object 7 notable_rivalry 80 non-null object 8 intensity 80 non-null object 9 sentiment 80 non-null object 10 stage_mood 80 non-null object 11 location 80 non-null object 12 play_id 80 non-null object 13 creature_id 80 non-null object 14 notes 80 non-null object dtypes: int64(1), object(14) memory usage: 9.5+ KB None
In [6]:
# Plain-text preview of the first five rows.
first_rows = df_dark_stage.head(5)
print(first_rows)
period author_id influence death_year incident_type \
0 Elizabethan william_shakespeare global 1616 literary_feud
1 Elizabethan william_shakespeare global 1616 social_snobbery
2 Elizabethan william_shakespeare global 1616 plagiarism
3 Elizabethan william_shakespeare global 1616 witty_repartee
4 Elizabethan christopher_marlowe global 1593 espionage
anecdote \
0 Robert Greene called him an “upstart crow” and...
1 Contemporaries whispered he wasn’t classy enou...
2 Some accused Shakespeare of letting others gho...
3 He slipped satirical digs at rivals like Jonso...
4 Rumored to be a spy, an atheist, and “wild”; s...
feud_with notable_rivalry intensity \
0 Robert Greene Greene’s pamphlet war notable
1 Fellow actors Shakespeare’s elitism minor
2 Anonymous rivals Shakespeare’s authorship notable
3 Ben Jonson, John Marston Shakespeare’s theatrical shade epic
4 Authorities, rivals Marlowe’s mysterious death notable
sentiment stage_mood location play_id creature_id \
0 jealousy 🤫 London none none
1 grudge 😏 Royal Court none none
2 humiliation 🤫 London none none
3 cunning 😈 Theatre none none
4 suspicion 🤔 London none none
notes
0 Greene’s pamphlet war
1 Shakespeare’s elitism
2 Shakespeare’s authorship
3 Shakespeare’s theatrical shade
4 Marlowe’s mysterious death
In [7]:
# Preview the frame that every later cell analyses (the one reloaded from the
# cleaned CSV), not the intermediate `df` built before the export, so the
# table shown here reflects exactly what the CSV round-trip produced.
df_dark_stage.head()
Out[7]:
| period | author_id | influence | death_year | incident_type | anecdote | feud_with | notable_rivalry | intensity | sentiment | stage_mood | location | play_id | creature_id | notes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Elizabethan | william_shakespeare | global | 1616 | literary_feud | Robert Greene called him an “upstart crow” and... | Robert Greene | Greene’s pamphlet war | notable | jealousy | 🤫 | London | none | none | Greene’s pamphlet war |
| 1 | Elizabethan | william_shakespeare | global | 1616 | social_snobbery | Contemporaries whispered he wasn’t classy enou... | Fellow actors | Shakespeare’s elitism | minor | grudge | 😏 | Royal Court | none | none | Shakespeare’s elitism |
| 2 | Elizabethan | william_shakespeare | global | 1616 | plagiarism | Some accused Shakespeare of letting others gho... | Anonymous rivals | Shakespeare’s authorship | notable | humiliation | 🤫 | London | none | none | Shakespeare’s authorship |
| 3 | Elizabethan | william_shakespeare | global | 1616 | witty_repartee | He slipped satirical digs at rivals like Jonso... | Ben Jonson, John Marston | Shakespeare’s theatrical shade | epic | cunning | 😈 | Theatre | none | none | Shakespeare’s theatrical shade |
| 4 | Elizabethan | christopher_marlowe | global | 1593 | espionage | Rumored to be a spy, an atheist, and “wild”; s... | Authorities, rivals | Marlowe’s mysterious death | notable | suspicion | 🤔 | London | none | none | Marlowe’s mysterious death |
🗣️ Columns Guide (15 columns)¶
=== CORE IDENTIFIERS ===
- author_id : the dramatist involved, linked to creators.csv
- play_id : the play concerned (if applicable)
- creature_id : the character involved, unwitting actor in the backstage drama
=== EVENT & STORY DIMENSIONS ===
- incident_type : nature of chaos (Rivalry, Duel, Censorship, Collaboration, Scandal, Witty Repartee…)
- anecdote : the juicy narrative (Duels, Betrayals, Quips, Literary Feuds…)
- intensity : scale of impact, measuring drama magnitude (Minor, Notable, Epic, etc.)
- sentiment : emotional or moral undertone (jealousy, grudge, admiration, humiliation, cunning, etc.)
- stage_mood : expressive emoji representing the backstage atmosphere (😏 😡 🤫 😈 🤔…)
=== CONTEXTUAL INFORMATION ===
- period : time indication (historical period or approximate year/range)
- location : where the incident occurred, Theatre, Tavern, Royal Court, London, etc.
- notes : sources, critical references, metadata, commentary backing up the tale
=== ADDITIONAL LITERARY METADATA ===
- influence : cultural/literary reach : national, global, etc.
- death_year : year of death of the dramatist (for timeline alignment)
- feud_with : person(s) or group involved in the feud : rivals, actors, critics, authorities
- notable_rivalry : named rivalry or controversy associated with the anecdote
In [8]:
# ------------------------------------------------------------------
# Playwrights 🎭
# ------------------------------------------------------------------
# Author x intensity counts, then each row divided by its own total so that
# every author's stacked bar shows proportions.
pivot_intensity = pd.crosstab(
    index=df_dark_stage['author_id'],
    columns=df_dark_stage['intensity'],
)
row_totals = pivot_intensity.sum(axis=1)
pivot_intensity = pivot_intensity.div(row_totals, axis=0)

pivot_intensity.plot.bar(stacked=True, figsize=(12, 4), colormap='cividis')
plt.title("DRAMA INTENSITY per AUTHOR")
plt.xlabel("Author")
plt.ylabel("")      # no numeric label on the y-axis
plt.yticks([])      # and no y ticks at all
plt.xticks(rotation=45, ha='right')
plt.show()
In [9]:
# Feud score = intensity weight, counted only for rows that name an opponent
# (rows with a missing `feud_with` are multiplied by 0).
weights = {"minor": 1, "notable": 2, "epic": 4}
intensity_weight = df_dark_stage["intensity"].str.lower().map(weights)
has_opponent = df_dark_stage["feud_with"].notna().astype(int)
df_dark_stage["feud_score"] = intensity_weight * has_opponent

# Total score per author, highest first.
ranking = (
    df_dark_stage
    .groupby("author_id")["feud_score"]
    .sum()
    .sort_values(ascending=False)
)
print(ranking.head(10))
author_id ben_jonson 12 christopher_marlowe 10 george_chapman 9 william_shakespeare 9 thomas_middleton 8 john_webster 8 philip_massinger 8 thomas_nashe 7 john_ford 7 thomas_dekker 7 Name: feud_score, dtype: int64
In [10]:
# Keep only the rows whose intensity is "epic" (case-insensitive).
is_epic = df_dark_stage["intensity"].str.lower() == "epic"
epic = df_dark_stage[is_epic]
pie_data = epic["author_id"].value_counts()

# Purple ramp: deep purple, royal purple, blue-violet, amethyst.
colors = ["#4b006e", "#6a0dad", "#8a2be2", "#9b5fc0"]
wedge_colors = colors[:len(pie_data)]

pie_data.plot.pie(figsize=(8, 8), autopct="%1.1f%%", colors=wedge_colors)
plt.title("WHO IS INVOLVED IN THE MOST EPIC FEUDS ?")
plt.ylabel("")
plt.show()
In [11]:
import networkx as nx

# Feud network: one undirected edge per (author, opponent) pair.
# A `feud_with` cell may list several opponents separated by commas
# (e.g. "Ben Jonson, John Marston"). Adding the raw cell as a node would create
# a single composite "person", so each comma-separated name gets its own node.
# NOTE(review): author_id values are snake_case ids while feud_with holds
# display names, so the same person may still appear as two nodes — confirm
# whether a name→id mapping is available.
feuds_df = df_dark_stage.dropna(subset=['feud_with'])
G = nx.Graph()
for _, row in feuds_df.iterrows():
    for opponent in str(row['feud_with']).split(','):
        opponent = opponent.strip()
        if opponent:  # ignore empty fragments, e.g. after a trailing comma
            G.add_edge(row['author_id'], opponent)

plt.figure(figsize=(12,8))
pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible
nx.draw(G, pos, with_labels=True, node_color='orange', edge_color='darkred', node_size=1000, font_size=8, font_color='black', width=3)
plt.title("FEUD NETWORK, Who Feuds with Whom")
plt.show()
In [12]:
# Epic-only feud network, restricted to the (up to) ten authors with the most
# epic feuds.
feuds_df = df_dark_stage.dropna(subset=['feud_with'])
feuds_df = feuds_df[feuds_df['intensity'].str.lower() == "epic"]  # only 'epic' rows that name an opponent
top_fighters = feuds_df['author_id'].value_counts().head(10).index
feuds_filtered = feuds_df[feuds_df['author_id'].isin(top_fighters)]

# A `feud_with` cell may hold several comma-separated opponents
# (e.g. "Ben Jonson, John Marston"); each name becomes its own node instead of
# one composite node for the whole cell.
G = nx.Graph()
for _, row in feuds_filtered.iterrows():
    for opponent in str(row['feud_with']).split(','):
        opponent = opponent.strip()
        if opponent:  # ignore empty fragments, e.g. after a trailing comma
            G.add_edge(row['author_id'], opponent)

plt.figure(figsize=(10,6))
pos = nx.spring_layout(G, seed=42)  # fixed seed keeps the layout reproducible
nx.draw(G, pos, with_labels=True, node_color='orange', edge_color='darkred', node_size=2000, font_size=9, font_color='black', width=3)
plt.title("EPIC FEUD NETWORK")
plt.show()
In [13]:
# ------------------------------------------------------------------
# Sentiment analysis
# ------------------------------------------------------------------
# Frequency of every sentiment label.
sentiment_col = df_dark_stage['sentiment']
sentiment_col.value_counts()
Out[13]:
sentiment jealousy 32 mischief 9 mockery 8 embarrassment 5 shock 5 frustration 4 regret 4 anger 3 suspicion 2 humiliation 1 grudge 1 cunning 1 outrage 1 pride 1 intrigue 1 fear 1 romantic_intrigue 1 Name: count, dtype: int64
In [14]:
# Horizontal count plot of sentiments, most frequent first, one palette
# colour per distinct sentiment.
sentiment_order = df_dark_stage['sentiment'].value_counts().index
sentiment_palette = sns.color_palette("mako_r", n_colors=df_dark_stage['sentiment'].nunique())

plt.figure(figsize=(10,5))
ax = sns.countplot(
    data=df_dark_stage,
    y='sentiment',
    hue='sentiment',
    order=sentiment_order,
    palette=sentiment_palette,
    dodge=False,
)
ax.set_title("DISTRIBUTION of SENTIMENTS")
ax.set_xlabel("Nber of incidents")
ax.set_ylabel("Sentiment")
plt.show()
In [15]:
# Incident counts per author, stacked by sentiment.
sentiment_author = pd.crosstab(
    index=df_dark_stage['author_id'],
    columns=df_dark_stage['sentiment'],
)
sentiment_author.plot.bar(stacked=True, figsize=(16, 6), colormap='mako_r')
plt.title("CROSS-TAB AUTHOR x SENTIMENT")
plt.ylabel("Nber of incidents")
plt.xlabel("Author")
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))  # anchored to the right of the axes
plt.xticks(rotation=45, ha='right')
plt.show()
In [16]:
# Incident counts per incident type, stacked by sentiment.
sentiment_incident = pd.crosstab(
    index=df_dark_stage['incident_type'],
    columns=df_dark_stage['sentiment'],
)
sentiment_incident.plot.bar(stacked=True, figsize=(10, 5), colormap='mako_r')
plt.title("CROSS-TAB of SENTIMENT x INCIDENT TYPE")
plt.ylabel("Nber of incidents")
plt.xlabel("Incident Type")
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))  # anchored to the right of the axes
plt.xticks(rotation=45, ha='right')
plt.show()
In [17]:
# ------------------------------------------------------------------
# Period 👑
# ------------------------------------------------------------------
# Incident counts per period, stacked by sentiment.
sentiment_period = pd.crosstab(
    index=df_dark_stage['period'],
    columns=df_dark_stage['sentiment'],
)
sentiment_period.plot.bar(stacked=True, figsize=(10, 6), colormap='mako_r')
plt.title("CROSS-TAB PERIOD x SENTIMENT (Elizabethan vs Jacobean)")
plt.ylabel("Nber of incidents")
plt.xlabel(" ")  # a single space replaces the automatic x-axis label
plt.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))  # anchored to the right of the axes
plt.xticks(rotation=45, ha='right')
plt.show()
In [18]:
# Grouped count plot: incident types on x, one bar colour per period.
plt.figure(figsize=(12,5))
ax = sns.countplot(x='incident_type', hue='period', palette='mako_r', data=df_dark_stage)
ax.set_title("INCIDENT TYPES by PERIOD")
ax.set_xlabel(" ")  # a single space replaces the automatic x-axis label
ax.set_ylabel("Nber of incidents")
ax.legend(title="Period")
plt.xticks(rotation=45, ha='right')
plt.show()
In [19]:
# ------------------------------------------------------------------
# Incidents & Locations
# ------------------------------------------------------------------
# Number of distinct incident types.
incident_col = df_dark_stage['incident_type']
incident_col.nunique()
Out[19]:
24
In [20]:
# Distinct incident types, in order of first appearance.
pd.unique(df_dark_stage['incident_type'])
Out[20]:
array(['literary_feud', 'social_snobbery', 'plagiarism', 'witty_repartee',
'espionage', 'violent_death', 'authorship', 'violent_feud',
'legal_scandal', 'literary_rivalry', 'poverty', 'literary_shade',
'literary_snobbery', 'court_intrigue', 'personal_gossip',
'scandalous_death', 'financial_scandal', 'satire', 'gossip',
'personal_scandal', 'collaboration_scandal', 'creative_jealousy',
'literary_scandal', 'sabotage'], dtype=object)
In [21]:
# Occurrences of each incident type, largest first.
incident_counts = df_dark_stage['incident_type'].value_counts().sort_values(ascending=False)

# One plasma colour per bar, evenly spaced across the colormap.
bar_colors = plt.cm.plasma(np.linspace(0, 1, len(incident_counts)))

plt.figure(figsize=(8,6))
incident_counts.plot.barh(color=bar_colors)
plt.title("DISTRIBUTION of INCIDENT TYPES")
plt.xlabel("Count of Incidents")
plt.ylabel("Incident Type")
plt.gca().invert_yaxis()  # flip the y-axis so the first (largest) entry is at the top
plt.show()
In [22]:
# Number of incidents per location, drawn as vertical bars.
location_counts = df_dark_stage['location'].value_counts()
plt.figure(figsize=(8,4))
location_counts.plot(kind='bar', color=plt.cm.viridis(np.linspace(0,1,len(location_counts))))
# kind='bar' puts the categories (locations) on x and the counts on y; the
# two axis labels were previously swapped.
plt.xlabel("Location")
plt.ylabel("Nber of Incidents")
plt.title("INCIDENT COUNT by LOCATION")
plt.show()
In [23]:
# Authors with at least one incident located at 'Theatre'.
in_theatre = df_dark_stage['location'] == 'Theatre'
theatre_incidents = df_dark_stage[in_theatre]
theatre_authors = theatre_incidents['author_id'].unique()
print("AUTHORS INVOLVED in THEATRE INCIDENTS :", theatre_authors)
AUTHORS INVOLVED in THEATRE INCIDENTS : ['william_shakespeare' 'ben_jonson' 'thomas_kyd' 'thomas_dekker' 'john_fletcher' 'john_ford' 'john_marston' 'john_webster' 'james_shirley' 'philip_massinger' 'thomas_heywood' 'william_rowley']
In [24]:
# Authors with at least one incident located at 'Tavern'.
tavern_incidents = df_dark_stage[df_dark_stage['location'] == 'Tavern']
# The result gets its own name: storing it in `theatre_authors` (copy-paste
# leftover) silently overwrote the theatre list computed in the previous cell.
tavern_authors = tavern_incidents['author_id'].unique()
print("AUTHORS INVOLVED in TAVERN INCIDENTS :", tavern_authors)
AUTHORS INVOLVED in TAVERN INCIDENTS : ['christopher_marlowe']
In [25]:
import matplotlib.ticker as mtick  # tick formatting helpers

# One panel per location, each counting the incident types seen there.
# Every panel is drawn from its own subset of rows, so anything seaborn infers
# from the data (category positions, hue→colour mapping) is inferred per panel:
#   * sharex=False gives each panel its own categorical x-axis and tick labels,
#     instead of one shared axis that has to reconcile different category sets;
#   * an explicit hue_order pins one colour per incident type across panels.
incident_order = sorted(df_dark_stage['incident_type'].dropna().unique())
g = sns.FacetGrid(df_dark_stage, col="location", col_wrap=3, height=4, sharex=False, sharey=False)
g.map_dataframe(
    sns.countplot,
    x="incident_type",
    hue="incident_type",
    hue_order=incident_order,
    palette="tab20",
    legend=False,
)
g.set_xticklabels(rotation=45)
g.set_axis_labels("Incident Type", "Count")
g.set_titles(col_template="{col_name}")
for ax in g.axes.flat:
    # y tick labels formatted with one decimal
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1f'))
    # write the count just above each bar
    for p in ax.patches:
        height = p.get_height()
        ax.text(x=p.get_x() + p.get_width()/2, y=height + 0.1, s=f"{int(height)}", ha='center')
plt.tight_layout()
plt.show()
In [26]:
# Incident-type frequencies for rows located in 'London'.
print("LONDON")
london_types = df_dark_stage.loc[df_dark_stage['location'] == 'London', 'incident_type']
print(london_types.value_counts())
print()
LONDON incident_type literary_feud 23 gossip 6 plagiarism 5 violent_feud 2 espionage 2 personal_scandal 2 financial_scandal 2 scandalous_death 2 legal_scandal 2 literary_shade 1 poverty 1 literary_rivalry 1 authorship 1 collaboration_scandal 1 creative_jealousy 1 satire 1 sabotage 1 Name: count, dtype: int64
In [27]:
# Incident-type frequencies for rows located in 'Royal Court'.
print("ROYAL COURT")
royal_court_types = df_dark_stage.loc[df_dark_stage['location'] == 'Royal Court', 'incident_type']
print(royal_court_types.value_counts())
print()
ROYAL COURT incident_type social_snobbery 1 Name: count, dtype: int64
In [28]:
# Incident-type frequencies for rows located in 'Theatre'.
print("THEATRE")
theatre_types = df_dark_stage.loc[df_dark_stage['location'] == 'Theatre', 'incident_type']
print(theatre_types.value_counts())
print()
THEATRE incident_type satire 6 literary_scandal 5 literary_feud 3 plagiarism 2 witty_repartee 1 legal_scandal 1 Name: count, dtype: int64
In [29]:
# Incident-type frequencies for rows located in 'Tavern'.
print("TAVERN")
tavern_types = df_dark_stage.loc[df_dark_stage['location'] == 'Tavern', 'incident_type']
print(tavern_types.value_counts())
print()
TAVERN incident_type violent_death 1 Name: count, dtype: int64
In [30]:
# Incident-type frequencies for rows located in 'Court'.
print("COURT")
court_types = df_dark_stage.loc[df_dark_stage['location'] == 'Court', 'incident_type']
print(court_types.value_counts())
print()
COURT incident_type court_intrigue 2 personal_scandal 2 literary_snobbery 1 personal_gossip 1 Name: count, dtype: int64
In [31]:
# Incident-type frequencies over the whole dataset.
incident_types = df_dark_stage['incident_type']
total_counts = incident_types.value_counts()
print("TOTAL", total_counts, sep="\n")
TOTAL incident_type literary_feud 26 plagiarism 7 satire 7 gossip 6 literary_scandal 5 personal_scandal 4 legal_scandal 3 violent_feud 2 espionage 2 court_intrigue 2 scandalous_death 2 financial_scandal 2 authorship 1 violent_death 1 social_snobbery 1 witty_repartee 1 personal_gossip 1 literary_snobbery 1 poverty 1 literary_shade 1 literary_rivalry 1 collaboration_scandal 1 creative_jealousy 1 sabotage 1 Name: count, dtype: int64