# ------------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------------
import pandas as pd                 # === CORE EDA ===
import numpy as np

import matplotlib.pyplot as plt     # === VISUALIZATION ===
import seaborn as sns
import plotly.express as px

import textwrap                     # === TEXT / Light NLP ===
import re
from collections import Counter
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer    # === MACHINE LEARNING ===
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from tabulate import tabulate       # === ENHANCED DISPLAY ===
from rich import print as rprint

plt.style.use("default")            # === DISPLAY SETTING === start from Matplotlib's "default" style sheet
sns.set_theme()                     # then apply seaborn's theme (called with its default arguments)

# ------------------------------------------------------------------
# 2. Convert Markdown (.md) to clean CSV
# ------------------------------------------------------------------
md_file = '../../data/raw/dark_stage_raw_dataset.md'

# Parse the pipe-delimited Markdown table. Row 1 is skipped -- presumably the
# |---|---| rule that Markdown places under the header row.
df = pd.read_csv(md_file, sep='|', skiprows=[1], engine='python')

# Discard the "ghost" columns: first those whose name starts with "Unnamed",
# then any column that holds no values at all.
ghost_mask = df.columns.str.contains("^Unnamed")
df = df.loc[:, ~ghost_mask].dropna(axis=1, how='all')

# Trim surrounding whitespace from the header names ...
df.columns = df.columns.str.strip()

# ... and from the values of every object-dtype (text) column.
text_cols = df.select_dtypes(include="object").columns
stripped_text = df[text_cols].apply(lambda series: series.str.strip())
df[text_cols] = stripped_text

# Drop 'style' (per the original note every row is 'blank-verse');
# errors="ignore" makes this a no-op when the column is absent.
df = df.drop(columns=["style"], errors="ignore")

# Persist the cleaned table, leaving out the index column.
df.to_csv('../../data/raw/dark_stage_clean.csv', index=False)


# ------------------------------------------------------------------
# 3. Load cleaned CSV 
# ------------------------------------------------------------------
df_dark_stage = pd.read_csv("../../data/raw/dark_stage_clean.csv")   # analysis frame, reloaded from the cleaned CSV

# ------------------------------------------------------------------
# 4. Quick overview
# ------------------------------------------------------------------
n_rows, n_cols = df_dark_stage.shape
print((n_rows, n_cols))                                              # (rows, columns)

(80, 15)

column_labels = df_dark_stage.columns       # Index holding the column names
print(column_labels)

Index(['period', 'author_id', 'influence', 'death_year', 'incident_type',
       'anecdote', 'feud_with', 'notable_rivalry', 'intensity', 'sentiment',
       'stage_mood', 'location', 'play_id', 'creature_id', 'notes'],
      dtype='object')

# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly: wrapping it in print() only appends a stray "None" line.
df_dark_stage.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   period           80 non-null     object
 1   author_id        80 non-null     object
 2   influence        80 non-null     object
 3   death_year       80 non-null     int64 
 4   incident_type    80 non-null     object
 5   anecdote         80 non-null     object
 6   feud_with        77 non-null     object
 7   notable_rivalry  80 non-null     object
 8   intensity        80 non-null     object
 9   sentiment        80 non-null     object
 10  stage_mood       80 non-null     object
 11  location         80 non-null     object
 12  play_id          80 non-null     object
 13  creature_id      80 non-null     object
 14  notes            80 non-null     object
dtypes: int64(1), object(14)
memory usage: 9.5+ KB
None

preview = df_dark_stage.head(5)      # first five rows (the size head() uses by default)
print(preview)

        period            author_id influence  death_year    incident_type  \
0  Elizabethan  william_shakespeare    global        1616    literary_feud   
1  Elizabethan  william_shakespeare    global        1616  social_snobbery   
2  Elizabethan  william_shakespeare    global        1616       plagiarism   
3  Elizabethan  william_shakespeare    global        1616   witty_repartee   
4  Elizabethan  christopher_marlowe    global        1593        espionage   

                                            anecdote  \
0  Robert Greene called him an “upstart crow” and...   
1  Contemporaries whispered he wasn’t classy enou...   
2  Some accused Shakespeare of letting others gho...   
3  He slipped satirical digs at rivals like Jonso...   
4  Rumored to be a spy, an atheist, and “wild”; s...   

                  feud_with                 notable_rivalry intensity  \
0             Robert Greene           Greene’s pamphlet war   notable   
1             Fellow actors           Shakespeare’s elitism     minor   
2          Anonymous rivals        Shakespeare’s authorship   notable   
3  Ben Jonson, John Marston  Shakespeare’s theatrical shade      epic   
4       Authorities, rivals      Marlowe’s mysterious death   notable   

     sentiment stage_mood     location play_id creature_id  \
0     jealousy          🤫       London    none        none   
1       grudge          😏  Royal Court    none        none   
2  humiliation          🤫       London    none        none   
3      cunning          😈      Theatre    none        none   
4    suspicion          🤔       London    none        none   

                            notes  
0           Greene’s pamphlet war  
1           Shakespeare’s elitism  
2        Shakespeare’s authorship  
3  Shakespeare’s theatrical shade  
4      Marlowe’s mysterious death

# Preview the frame the analysis actually uses (the one reloaded from the cleaned
# CSV) rather than the intermediate `df` from the Markdown conversion step.
df_dark_stage.head()

# ------------------------------------------------------------------
# Playwrights 🎭
# ------------------------------------------------------------------
# Share of each intensity level within every author's incidents:
# normalize='index' divides each row of the count table by its row total.
pivot_intensity = pd.crosstab(
    df_dark_stage['author_id'],
    df_dark_stage['intensity'],
    normalize='index',
)

ax = pivot_intensity.plot.bar(stacked=True, figsize=(12, 4), colormap='cividis')

ax.set_title("DRAMA INTENSITY per AUTHOR")
ax.set_xlabel("Author")
ax.set_ylabel("")                    # the y-axis carries no label ...
ax.set_yticks([])                    # ... and no ticks: only the relative split is shown

plt.xticks(rotation=45, ha='right')
plt.show()

# Points awarded per intensity level.
weights = {"minor": 1, "notable": 2, "epic": 4}

# Score = intensity weight, zeroed for rows that name no opponent in `feud_with`.
intensity_weight = df_dark_stage["intensity"].str.lower().map(weights)
has_opponent = df_dark_stage["feud_with"].notna().astype(int)
df_dark_stage["feud_score"] = intensity_weight * has_opponent

# Total score per author, highest first.
score_by_author = df_dark_stage.groupby("author_id")["feud_score"].sum()
ranking = score_by_author.sort_values(ascending=False)

print(ranking.iloc[:10])

author_id
ben_jonson             12
christopher_marlowe    10
george_chapman          9
william_shakespeare     9
thomas_middleton        8
john_webster            8
philip_massinger        8
thomas_nashe            7
john_ford               7
thomas_dekker           7
Name: feud_score, dtype: int64

# Rows whose intensity is "epic" (compared case-insensitively).
is_epic = df_dark_stage["intensity"].str.lower().eq("epic")
epic = df_dark_stage.loc[is_epic]

# Number of epic incidents per author, most frequent first.
pie_data = epic["author_id"].value_counts()

# Purple shades: deep purple, royal purple, blue-violet, amethyst.
colors = ["#4b006e", "#6a0dad", "#8a2be2", "#9b5fc0"]

n_slices = len(pie_data)
pie_data.plot.pie(figsize=(8, 8), autopct="%1.1f%%", colors=colors[:n_slices])

plt.title("WHO IS INVOLVED IN THE MOST EPIC FEUDS ?")
plt.ylabel("")                       # suppress the series name pandas would put on the y-axis
plt.show()

import networkx as nx

# Only rows that name at least one opponent can contribute edges.
feuds_df = df_dark_stage.dropna(subset=['feud_with'])

# Build the undirected feud graph (author <-> opponent).
# `feud_with` can list several opponents in one comma-separated cell
# ("Ben Jonson, John Marston") and uses display names, whereas `author_id` is
# lower snake_case. Each listed opponent therefore gets its own edge, and its
# name is folded to the author_id spelling so that "Ben Jonson" lands on the
# existing `ben_jonson` node instead of creating a second, unconnected one.
G = nx.Graph()
for author, opponents in zip(feuds_df['author_id'], feuds_df['feud_with']):
    for opponent in str(opponents).split(','):
        opponent_id = "_".join(opponent.lower().split())
        if opponent_id:                                      # skip blanks left by stray commas
            G.add_edge(author, opponent_id)

plt.figure(figsize=(12,8))                                   # Draw graph
pos = nx.spring_layout(G, seed=42)                           # fixed seed -> reproducible layout
nx.draw(G, pos, with_labels=True, node_color='orange', edge_color='darkred', node_size=1000, font_size=8, font_color='black', width=3)

plt.title("FEUD NETWORK, Who Feuds with Whom")
plt.show()

# Keep rows that name an opponent AND are of 'epic' intensity (case-insensitive).
feuds_df = df_dark_stage.dropna(subset=['feud_with'])
feuds_df = feuds_df[feuds_df['intensity'].str.lower() == "epic"]

# Restrict to the (up to) ten authors with the most epic feuds.
top_fighters = (feuds_df['author_id'].value_counts().head(10).index)
feuds_filtered = feuds_df[feuds_df['author_id'].isin(top_fighters)]

# Build the undirected graph. A `feud_with` cell may hold several comma-separated
# display names ("Ben Jonson, John Marston") while `author_id` is lower
# snake_case, so each opponent is split out and folded to the author_id spelling;
# otherwise a multi-name cell becomes one bogus node and "Ben Jonson" never
# meets the `ben_jonson` author node.
G = nx.Graph()
for author, opponents in zip(feuds_filtered['author_id'], feuds_filtered['feud_with']):
    for opponent in str(opponents).split(','):
        opponent_id = "_".join(opponent.lower().split())
        if opponent_id:                                      # skip blanks left by stray commas
            G.add_edge(author, opponent_id)

plt.figure(figsize=(10,6))
pos = nx.spring_layout(G, seed=42)                           # fixed seed -> reproducible layout

nx.draw(G, pos, with_labels=True, node_color='orange', edge_color='darkred', node_size=2000, font_size=9, font_color='black', width=3)

plt.title("EPIC FEUD NETWORK")
plt.show()

# ------------------------------------------------------------------
# Sentiment analysis
# ------------------------------------------------------------------
sentiment_counts = df_dark_stage['sentiment'].value_counts()     # incidents per sentiment, most frequent first
sentiment_counts

sentiment
jealousy             32
mischief              9
mockery               8
embarrassment         5
shock                 5
frustration           4
regret                4
anger                 3
suspicion             2
humiliation           1
grudge                1
cunning               1
outrage               1
pride                 1
intrigue              1
fear                  1
romantic_intrigue     1
Name: count, dtype: int64

# Sentiments from most to least frequent; computed once and reused for the bar
# order, the colour order and the palette size.
sentiment_order = df_dark_stage['sentiment'].value_counts().index

plt.figure(figsize=(10,5))

# hue_order mirrors `order` so the palette gradient runs along the sorted bars
# instead of following the order in which sentiments first appear in the data.
sns.countplot(data=df_dark_stage, y='sentiment', order=sentiment_order,
    hue='sentiment', hue_order=sentiment_order,
    palette=sns.color_palette("mako_r", n_colors=len(sentiment_order)), dodge=False)

plt.title("DISTRIBUTION of SENTIMENTS")
plt.xlabel("Nber of incidents")
plt.ylabel("Sentiment")
plt.show()

# Incident counts for every (author, sentiment) pair.
sentiment_author = pd.crosstab(index=df_dark_stage['author_id'], columns=df_dark_stage['sentiment'])

ax = sentiment_author.plot.bar(stacked=True, figsize=(16, 6), colormap='mako_r')

ax.set_title("CROSS-TAB AUTHOR x SENTIMENT")
ax.set_xlabel("Author")
ax.set_ylabel("Nber of incidents")
ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))      # legend anchored at axes coordinates (1.05, 1)

plt.xticks(rotation=45, ha='right')
plt.show()

# Incident counts for every (incident type, sentiment) pair.
sentiment_incident = pd.crosstab(index=df_dark_stage['incident_type'], columns=df_dark_stage['sentiment'])

ax = sentiment_incident.plot.bar(stacked=True, figsize=(10, 5), colormap='mako_r')

ax.set_title("CROSS-TAB of SENTIMENT x INCIDENT TYPE")
ax.set_xlabel("Incident Type")
ax.set_ylabel("Nber of incidents")
ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))      # legend anchored at axes coordinates (1.05, 1)

plt.xticks(rotation=45, ha='right')
plt.show()

# ------------------------------------------------------------------
# Period 👑
# ------------------------------------------------------------------
# Incident counts for every (period, sentiment) pair.
sentiment_period = pd.crosstab(index=df_dark_stage['period'], columns=df_dark_stage['sentiment'])

ax = sentiment_period.plot.bar(stacked=True, figsize=(10, 6), colormap='mako_r')

ax.set_title("CROSS-TAB PERIOD x SENTIMENT (Elizabethan vs Jacobean)")
ax.set_xlabel(" ")                                          # single space: replaces the default axis label
ax.set_ylabel("Nber of incidents")
ax.legend(title='Sentiment', bbox_to_anchor=(1.05, 1))      # legend anchored at axes coordinates (1.05, 1)

plt.xticks(rotation=45, ha='right')
plt.show()

# Grouped bars: one bar per period for every incident type.
fig, ax = plt.subplots(figsize=(12, 5))
sns.countplot(data=df_dark_stage, x='incident_type', hue='period', palette='mako_r', ax=ax)

ax.set_title("INCIDENT TYPES by PERIOD")
ax.set_xlabel(" ")                                          # single space: replaces the default axis label
ax.set_ylabel("Nber of incidents")
ax.legend(title="Period")

plt.xticks(rotation=45, ha='right')
plt.show()

# ------------------------------------------------------------------
# Incidents & Locations
# ------------------------------------------------------------------
n_incident_types = df_dark_stage['incident_type'].nunique()     # number of distinct incident types
n_incident_types

24

incident_type_values = df_dark_stage['incident_type'].unique()  # distinct incident types, in order of first appearance
incident_type_values

array(['literary_feud', 'social_snobbery', 'plagiarism', 'witty_repartee',
       'espionage', 'violent_death', 'authorship', 'violent_feud',
       'legal_scandal', 'literary_rivalry', 'poverty', 'literary_shade',
       'literary_snobbery', 'court_intrigue', 'personal_gossip',
       'scandalous_death', 'financial_scandal', 'satire', 'gossip',
       'personal_scandal', 'collaboration_scandal', 'creative_jealousy',
       'literary_scandal', 'sabotage'], dtype=object)

# Occurrences of each incident type, largest count first.
incident_counts = df_dark_stage['incident_type'].value_counts().sort_values(ascending=False)

# One colour per bar, sampled evenly across the plasma colormap.
bar_colors = plt.cm.plasma(np.linspace(0, 1, len(incident_counts)))

plt.figure(figsize=(8,6))
ax = incident_counts.plot.barh(color=bar_colors)

ax.set_xlabel("Count of Incidents")
ax.set_ylabel("Incident Type")
ax.set_title("DISTRIBUTION of INCIDENT TYPES")
ax.invert_yaxis()                    # flip the axis so the first (largest) entry sits at the top
plt.show()

# Number of incidents recorded at each location, most frequent first.
location_counts = df_dark_stage['location'].value_counts()

plt.figure(figsize=(8,4))
# One colour per bar, sampled evenly across the viridis colormap.
location_counts.plot(kind='bar', color=plt.cm.viridis(np.linspace(0,1,len(location_counts))))

# kind='bar' draws vertical bars: locations run along x and counts along y,
# so the labels are assigned accordingly (they were swapped before).
plt.xlabel("Location")
plt.ylabel("Nber of Incidents")
plt.title("INCIDENT COUNT by LOCATION")
plt.show()

# Distinct authors with at least one incident located at 'Theatre'.
in_theatre = df_dark_stage['location'].eq('Theatre')
theatre_incidents = df_dark_stage.loc[in_theatre]
theatre_authors = theatre_incidents['author_id'].unique()
print("AUTHORS INVOLVED in THEATRE INCIDENTS :", theatre_authors)

AUTHORS INVOLVED in THEATRE INCIDENTS : ['william_shakespeare' 'ben_jonson' 'thomas_kyd' 'thomas_dekker'
 'john_fletcher' 'john_ford' 'john_marston' 'john_webster' 'james_shirley'
 'philip_massinger' 'thomas_heywood' 'william_rowley']

# Distinct authors with at least one incident located at 'Tavern'.
# Stored under its own name: the result used to be written to `theatre_authors`,
# which silently overwrote the theatre list with tavern data.
tavern_incidents = df_dark_stage[df_dark_stage['location'] == 'Tavern']
tavern_authors = tavern_incidents['author_id'].unique()
print("AUTHORS INVOLVED in TAVERN INCIDENTS :", tavern_authors)

AUTHORS INVOLVED in TAVERN INCIDENTS : ['christopher_marlowe']

import matplotlib.ticker as mtick       # Provides tools to format and control axis tick marks and labels in Matplotlib

# One fixed colour per incident type, so a given type looks the same in every
# facet. (A palette *name* is re-mapped inside each facet from that facet's own
# levels; tab20 is cycled when there are more types than colours.)
incident_types = df_dark_stage['incident_type'].unique()
type_palette = dict(zip(incident_types, sns.color_palette("tab20", n_colors=len(incident_types))))

# One subplot per location. sharex=False matters: every facet holds a different
# set of incident types, and with a shared categorical x-axis the tick labels of
# one facet end up on all the others, mislabelling their bars.
g = sns.FacetGrid(df_dark_stage, col="location", col_wrap=3, height=4, sharex=False, sharey=False)

g.map_dataframe(sns.countplot, x="incident_type", hue="incident_type", palette=type_palette, legend=False)   # Countplot in each subplot

g.set_xticklabels(rotation=45)                        # Rotate x-axis labels and set titles
g.set_axis_labels("Incident Type", "Count")
g.set_titles(col_template="{col_name}")

for ax in g.axes.flat:
    ax.yaxis.set_major_formatter(mtick.FormatStrFormatter('%.1f'))    # y tick labels with 1 decimal

    for p in ax.patches:                              # Add counts on top of each bar
        height = p.get_height()
        if not np.isfinite(height) or height <= 0:    # empty / NaN bars: int(NaN) would raise, "0" is noise
            continue
        ax.text(x=p.get_x() + p.get_width()/2, y=height + 0.1, s=f"{int(height)}", ha='center')

plt.tight_layout()
plt.show()

# Incident-type frequencies for rows located in 'London'.
print("LONDON")
london_types = df_dark_stage.loc[df_dark_stage['location'] == 'London', 'incident_type']
print(london_types.value_counts())
print()

LONDON
incident_type
literary_feud            23
gossip                    6
plagiarism                5
violent_feud              2
espionage                 2
personal_scandal          2
financial_scandal         2
scandalous_death          2
legal_scandal             2
literary_shade            1
poverty                   1
literary_rivalry          1
authorship                1
collaboration_scandal     1
creative_jealousy         1
satire                    1
sabotage                  1
Name: count, dtype: int64

# Incident-type frequencies for rows located in 'Royal Court'.
print("ROYAL COURT")
royal_court_types = df_dark_stage.loc[df_dark_stage['location'] == 'Royal Court', 'incident_type']
print(royal_court_types.value_counts())
print()

ROYAL COURT
incident_type
social_snobbery    1
Name: count, dtype: int64

# Incident-type frequencies for rows located in 'Theatre'.
print("THEATRE")
theatre_types = df_dark_stage.loc[df_dark_stage['location'] == 'Theatre', 'incident_type']
print(theatre_types.value_counts())
print()

THEATRE
incident_type
satire              6
literary_scandal    5
literary_feud       3
plagiarism          2
witty_repartee      1
legal_scandal       1
Name: count, dtype: int64

# Incident-type frequencies for rows located in 'Tavern'.
print("TAVERN")
tavern_types = df_dark_stage.loc[df_dark_stage['location'] == 'Tavern', 'incident_type']
print(tavern_types.value_counts())
print()

TAVERN
incident_type
violent_death    1
Name: count, dtype: int64

# Incident-type frequencies for rows located in 'Court'.
print("COURT")
court_types = df_dark_stage.loc[df_dark_stage['location'] == 'Court', 'incident_type']
print(court_types.value_counts())
print()

COURT
incident_type
court_intrigue       2
personal_scandal     2
literary_snobbery    1
personal_gossip      1
Name: count, dtype: int64

# Incident-type frequencies over the whole dataset, most frequent first.
incident_type_column = df_dark_stage['incident_type']
total_counts = incident_type_column.value_counts()
print("TOTAL")
print(total_counts)

TOTAL
incident_type
literary_feud            26
plagiarism                7
satire                    7
gossip                    6
literary_scandal          5
personal_scandal          4
legal_scandal             3
violent_feud              2
espionage                 2
court_intrigue            2
scandalous_death          2
financial_scandal         2
authorship                1
violent_death             1
social_snobbery           1
witty_repartee            1
personal_gossip           1
literary_snobbery         1
poverty                   1
literary_shade            1
literary_rivalry          1
collaboration_scandal     1
creative_jealousy         1
sabotage                  1
Name: count, dtype: int64

ToGitOrNotToGit 💀

EDA 🗣️ `dark_stage` dataset

🗣️ Columns Guide (15 columns)

	period	author_id	influence	death_year	incident_type	anecdote	feud_with	notable_rivalry	intensity	sentiment	stage_mood	location	play_id	creature_id	notes
0	Elizabethan	william_shakespeare	global	1616	literary_feud	Robert Greene called him an “upstart crow” and...	Robert Greene	Greene’s pamphlet war	notable	jealousy	🤫	London	none	none	Greene’s pamphlet war
1	Elizabethan	william_shakespeare	global	1616	social_snobbery	Contemporaries whispered he wasn’t classy enou...	Fellow actors	Shakespeare’s elitism	minor	grudge	😏	Royal Court	none	none	Shakespeare’s elitism
2	Elizabethan	william_shakespeare	global	1616	plagiarism	Some accused Shakespeare of letting others gho...	Anonymous rivals	Shakespeare’s authorship	notable	humiliation	🤫	London	none	none	Shakespeare’s authorship
3	Elizabethan	william_shakespeare	global	1616	witty_repartee	He slipped satirical digs at rivals like Jonso...	Ben Jonson, John Marston	Shakespeare’s theatrical shade	epic	cunning	😈	Theatre	none	none	Shakespeare’s theatrical shade
4	Elizabethan	christopher_marlowe	global	1593	espionage	Rumored to be a spy, an atheist, and “wild”; s...	Authorities, rivals	Marlowe’s mysterious death	notable	suspicion	🤔	London	none	none	Marlowe’s mysterious death

ToGitOrNotToGit 💀

EDA 🗣️ `dark_stage` dataset

🗣️ Columns Guide (15 columns)

EDA 🗣️ `dark_stage` dataset