# ------------------------------------------------------------------
# 1. Import libraries
# ------------------------------------------------------------------
import pandas as pd                 # === CORE EDA ===
import numpy as np

import matplotlib.pyplot as plt     # === VISUALIZATION ===
import seaborn as sns
import plotly.express as px

import textwrap                     # === TEXT / Light NLP ===
import re
from collections import Counter
from wordcloud import WordCloud

from sklearn.feature_extraction.text import TfidfVectorizer    # === MACHINE LEARNING ===
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from tabulate import tabulate       # === ENHANCED DISPLAY ===
from rich import print as rprint

plt.style.use("default")            # === DISPLAY SETTING ===
sns.set_theme()


# ------------------------------------------------------------------
# 2. Load dataset
# ------------------------------------------------------------------
# Read the raw creators table; the path is relative to the working directory.
df_creators = pd.read_csv(
    filepath_or_buffer="../../data/raw/creators_raw_dataset.csv",
)

# ------------------------------------------------------------------
# 3. Quick overview
# ------------------------------------------------------------------
# (rows, columns) of the freshly loaded table.
print(f"SHAPE : {df_creators.shape}")

SHAPE : (20, 12)

# Print the column names, non-null counts, dtypes and memory usage.
df_creators.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Author            20 non-null     object
 1   Birth             20 non-null     int64 
 2   Death             20 non-null     int64 
 3   Birthplace        20 non-null     object
 4   Period            20 non-null     object
 5   Genre             20 non-null     object
 6   Major Works       20 non-null     object
 7   Style & Themes    20 non-null     object
 8   Iconic Creatures  20 non-null     object
 9   Visual Motifs     20 non-null     object
 10  Relevance         20 non-null     object
 11  Notes             20 non-null     object
dtypes: int64(2), object(10)
memory usage: 2.0+ KB

# Show floats with two decimals in every subsequent DataFrame rendering.
pd.set_option("display.float_format", "{:.2f}".format)
# Summary statistics for numeric and object columns alike.
df_creators.describe(include="all")

# First five rows, as a sanity check of the parsed columns.
df_creators.head()

# ------------------------------------------------------------------
# Distribution of Years (Birth & Death)
# ------------------------------------------------------------------
# === Birth and Death Year Distribution ===
plt.figure(figsize=(12, 5))

# One histogram panel per year column: (column, panel title, bar colour).
year_panels = [
    ("Birth", "BIRTH YEAR DISTRIBUTION", "skyblue"),
    ("Death", "DEATH YEAR DISTRIBUTION", "salmon"),
]

for panel_number, (year_column, panel_title, bar_colour) in enumerate(year_panels, start=1):
    plt.subplot(1, 2, panel_number)                         # side-by-side layout
    sns.histplot(df_creators[year_column], bins=10, kde=True, color=bar_colour)
    plt.title(panel_title)
    plt.xlabel("Year")
    plt.ylabel("Count")

plt.tight_layout()
plt.show()

# === Calculate Age at Death ===
# Each author's lifespan, ordered chronologically by birth, to easily see generational patterns.

# Lifespan in years: death year minus birth year.
df_creators['Age'] = df_creators['Death'].sub(df_creators['Birth'])

# Chronological order by birth year, so neighbouring bars are contemporaries.
df_age_sorted = df_creators.sort_values(by='Birth')

# One colour per author, sampled evenly across the viridis colormap.
bar_colors = plt.cm.viridis(np.linspace(0, 1, len(df_age_sorted)))

plt.figure(figsize=(12, 6))
plt.barh(df_age_sorted['Author'], df_age_sorted['Age'], color=bar_colors)
plt.title("AUTHORS' AGE at DEATH (sorted by Birth Year)")
plt.xlabel("Age")
plt.ylabel("Author")
plt.gca().invert_yaxis()  # flip the axis so the earliest-born author sits at the top
plt.show()

# === Create sorted list with lifespan ===
# Compact lifespan table: one row per author with the derived age at death,
# ordered by birth year.
df_age_sorted = (
    df_creators[['Author', 'Birth', 'Death']]
    .assign(Age=lambda frame: frame['Death'] - frame['Birth'])
    .sort_values('Birth')
)

display(df_age_sorted)

print("=== 3 YOUNGEST at DEATH ===")
youngest_three = df_age_sorted.nsmallest(3, 'Age')   # rows with the three smallest ages
display(youngest_three)

=== 3 YOUNGEST at DEATH ===

print("=== 3 OLDEST at DEATH ===")
oldest_three = df_age_sorted.nlargest(3, 'Age')      # rows with the three largest ages
display(oldest_three)

=== 3 OLDEST at DEATH ===

# === Dramatic, literary print ===
# One line per author: name, life dates, age at death and a "fate" label.
# Ages strictly below 30 and strictly above 70 get their own label;
# everything else (30 and 70 included) is a "measured life".
for record in df_age_sorted.itertuples(index=False):
    age = record.Age
    fate = (
        "💀 Cut tragically short" if age < 30
        else "🌹 Blessed with longevity" if age > 70
        else "🕯️ Lived a measured life"
    )
    print(f"{record.Author} ({record.Birth}-{record.Death}) / {age} / {fate}")

John Lyly (1554-1606) / 52 / 🕯️ Lived a measured life
George Peele (1556-1596) / 40 / 🕯️ Lived a measured life
Robert Greene (1558-1592) / 34 / 🕯️ Lived a measured life
Thomas Kyd (1558-1594) / 36 / 🕯️ Lived a measured life
George Chapman (1559-1634) / 75 / 🌹 Blessed with longevity
Christopher Marlowe (1564-1593) / 29 / 💀 Cut tragically short
William Shakespeare (1564-1616) / 52 / 🕯️ Lived a measured life
Thomas Nashe (1567-1601) / 34 / 🕯️ Lived a measured life
Thomas Dekker (1572-1632) / 60 / 🕯️ Lived a measured life
Ben Jonson (1572-1637) / 65 / 🕯️ Lived a measured life
Thomas Heywood (1574-1641) / 67 / 🕯️ Lived a measured life
John Marston (1576-1634) / 58 / 🕯️ Lived a measured life
John Fletcher (1579-1625) / 46 / 🕯️ Lived a measured life
Thomas Middleton (1580-1627) / 47 / 🕯️ Lived a measured life
John Webster (1580-1634) / 54 / 🕯️ Lived a measured life
Philip Massinger (1583-1640) / 57 / 🕯️ Lived a measured life
Francis Beaumont (1584-1616) / 32 / 🕯️ Lived a measured life
William Rowley (1585-1626) / 41 / 🕯️ Lived a measured life
John Ford (1586-1639) / 53 / 🕯️ Lived a measured life
James Shirley (1596-1666) / 70 / 🕯️ Lived a measured life

# === Sort authors by Birth and reset index ===
# Timeline table ordered by birth year; the index is reset so that the row
# position can serve directly as the y coordinate of each author.
df_timeline = (
    df_creators[['Author', 'Birth', 'Death']]
    .assign(Age=lambda frame: frame['Death'] - frame['Birth'])
    .sort_values('Birth')
    .reset_index(drop=True)
)

# === Plot Timeline ===
plt.figure(figsize=(14, 8))

life_spans = zip(df_timeline['Birth'], df_timeline['Death'])
for y_pos, (born, died) in enumerate(life_spans):
    plt.plot([born, died], [y_pos, y_pos], color='darkred', linewidth=4)  # life span
    plt.scatter(born, y_pos, color='green', s=50)                         # birth marker
    plt.scatter(died, y_pos, color='black', s=50)                         # death marker

plt.yticks(range(len(df_timeline)), df_timeline['Author'])
plt.title("AUTHORS TIMELINE : Birth → Death")
plt.xlabel("Year")
plt.gca().invert_yaxis()  # earliest birth at the top
plt.show()

# ------------------------------------------------------------------
# Periods & Genres
# ------------------------------------------------------------------
# === Split Periods and Genres into lists ===
# Each cell holds comma-separated labels (e.g. "Jacobean, Caroline"): split on
# a comma plus any whitespace that follows it. The pattern is a raw string
# because "\s" inside a normal string literal is an invalid escape sequence
# (SyntaxWarning on Python 3.12+); regex=True makes the regex intent explicit.
df_creators['Period_list'] = df_creators['Period'].str.split(r',\s*', regex=True)
df_creators['Genre_list'] = df_creators['Genre'].str.split(r',\s*', regex=True)

# Counter is already imported at the top of the file; no re-import needed here.

import matplotlib as mpl
# Font used so that the emoji in the chart titles can be rendered.
# NOTE(review): 'Segoe UI Emoji' is a Windows font; on other systems matplotlib
# presumably falls back to its default font with a warning — confirm.
mpl.rcParams['font.family'] = 'Segoe UI Emoji'

# === Flatten all periods & genres for counting ===
# An author with several labels contributes one entry per label.
all_periods = [p for sublist in df_creators['Period_list'] for p in sublist]
all_genres = [g for sublist in df_creators['Genre_list'] for g in sublist]

# Label -> number of authors carrying that label.
period_counts = Counter(all_periods)
genre_counts = Counter(all_genres)

# === Periods Bar Chart ===
# Bars follow the order in which each period was first encountered.
period_labels = list(period_counts)
period_totals = list(period_counts.values())

plt.figure(figsize=(10, 5))
colors = plt.cm.magma(np.linspace(0, 1, len(period_labels)))   # one colour per bar
plt.bar(period_labels, period_totals, color=colors)
plt.title("👑 AUTHORS by PERIOD (Elizabethan → Jacobean → Caroline)", fontsize=14)
plt.xlabel("Period")
plt.ylabel("Nber of Authors")
plt.show()

# Flatten the per-author genre lists into one list (duplicates kept).
all_genres = []
for author_genres in df_creators['Genre_list']:
    all_genres.extend(author_genres)

# Distinct genre labels in alphabetical order.
unique_genres = sorted(set(all_genres))

print("🎭 GENRES :")
for genre_label in unique_genres:
    print("-", genre_label)

🎭 GENRES :
- Comedy
- Court Masque
- Domestic Tragedy
- Drama
- Masque
- Morality
- Pageant
- Pastoral
- Poetry
- Satire
- Tragedy
- Tragicomedy
- Translation

# === Genres Bar Chart ===
# Bars follow the order in which each genre was first encountered.
genre_labels = list(genre_counts)
genre_totals = list(genre_counts.values())

plt.figure(figsize=(12, 5))
colors = plt.cm.cividis(np.linspace(0, 1, len(genre_labels)))   # one colour per bar
plt.bar(genre_labels, genre_totals, color=colors)
plt.title("🎭 AUTHORS by GENRE", fontsize=14)
plt.xlabel("Genre")
plt.ylabel("Count")
plt.xticks(rotation=45)   # tilt the labels so long genre names do not overlap
plt.show()

# ------------------------------------------------------------------
# Iconic Creatures & Visual Motifs
# ------------------------------------------------------------------
# Flatten lists for counting
# Split each cell on a comma followed by optional whitespace. The pattern is a
# raw string because "\s" in a normal string literal is an invalid escape
# sequence (SyntaxWarning on Python 3.12+); regex=True makes the intent explicit.
df_creators['IconicCreatures_list'] = df_creators['Iconic Creatures'].str.split(r',\s*', regex=True)
# NOTE(review): the sample rows show 'Visual Motifs' as emoji runs without
# commas, so each cell presumably stays a single list item — confirm.
df_creators['VisualMotifs_list'] = df_creators['Visual Motifs'].str.split(r',\s*', regex=True)

# Flatten the per-author lists (one entry per creature / motif).
all_creatures = [c for sublist in df_creators['IconicCreatures_list'] for c in sublist]
all_motifs = [m for sublist in df_creators['VisualMotifs_list'] for m in sublist]

# Item -> number of occurrences across all authors.
creature_counts = Counter(all_creatures)
motif_counts = Counter(all_motifs)   # not used in the visible part of the file

# === WordCloud for Creatures ===
# Word cloud in which each creature is sized by its occurrence count.
wc_creatures = WordCloud(
    width=800,
    height=400,
    background_color='black',
    colormap='Reds',
).generate_from_frequencies(creature_counts)

plt.figure(figsize=(12, 6))
plt.imshow(wc_creatures, interpolation='bilinear')
plt.title("✨ ICONIC CREATURES", fontsize=16)
plt.axis('off')   # an image needs no axes
plt.show()

# ------------------------------------------------------------------
# English Roots
# ------------------------------------------------------------------
# Number of authors per birthplace, most frequent first.
birthplace_counts = df_creators.loc[:, 'Birthplace'].value_counts()

# Heading and table are emitted on separate lines.
print("🏰 AUTHORS per BIRTHPLACE :", birthplace_counts, sep="\n")

🏰 AUTHORS per BIRTHPLACE :
Birthplace
London                 8
Leicestershire         1
Hitchin                1
Rye                    1
Devon                  1
Norwich                1
Lincolnshire           1
Kent                   1
Coventry               1
Canterbury             1
Salisbury              1
Lowestoft              1
Stratford-upon-Avon    1
Name: count, dtype: int64

# Authors whose recorded birthplace is exactly 'London'.
born_in_london = df_creators['Birthplace'].eq('London')
london_authors = df_creators.loc[born_in_london]
london_authors[['Author', 'Period', 'Genre']]

# Bar chart of authors per birthplace, in value_counts order.
colors = plt.cm.viridis(np.linspace(0, 1, len(birthplace_counts)))   # one colour per bar
plt.figure(figsize=(12, 3))
bars = plt.bar(birthplace_counts.index, birthplace_counts.values, color=colors)
plt.title("🏰 AUTHORS per BIRTHPLACE", fontsize=16)
plt.xlabel("Birthplace")
plt.ylabel("Nber of Authors")
plt.xticks(rotation=45)
plt.show()

# ------------------------------------------------------------------
# Major Works
# ------------------------------------------------------------------
# Drop the literal '*' markers wrapped around each title, then merge all cells
# into a single space-separated text.
starless_works = df_creators['Major Works'].str.replace('*', '', regex=False)
titles_text = ' '.join(starless_works)

# Word cloud built from the raw title text.
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(titles_text)

plt.figure(figsize=(12, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.title("MAJOR WORKS TITLES", fontsize=16)
plt.axis('off')
plt.show()

# All titles as one text, with the '*' markers removed.
starless_titles = df_creators['Major Works'].str.replace('*', '', regex=False)
titles_text = ' '.join(starless_titles)

# Lower-case tokens made of ASCII letters only (apostrophes split words,
# which is why 's' and 'd' appear in the stopword set below).
word_pattern = re.compile(r'\b[a-zA-Z]+\b')
tokens = word_pattern.findall(titles_text.lower())

stopwords = {'the', 's', 'a', 'd', 'of', 'and'}

# Keep the tokens that are not stopwords, preserving their order.
filtered_tokens = [token for token in tokens if token not in stopwords]

# Frequency table and its ten most common entries.
filtered_counts = Counter(filtered_tokens)
top_filtered = filtered_counts.most_common(10)

print("Top thematic words in titles (stopwords removed):")
for top_word, occurrences in top_filtered:
    print(f"{top_word}: {occurrences}")

Top thematic words in titles (stopwords removed):
tragedy: 2
shoemaker: 2
whore: 2
spanish: 2
friar: 2
traveller: 2
old: 2
women: 2
philaster: 1
maid: 1

# Words that occur exactly twice across all titles.
words_2x = [
    token
    for token, occurrences in filtered_counts.items()
    if occurrences == 2
]

# Function to check if a title contains any of these words
def has_2x_word(title, words=None):
    """Return True if *title* contains at least one of the target words.

    Asterisks are removed, the text is lower-cased and split into tokens made
    of ASCII letters only; matching is on whole tokens, not substrings.

    Args:
        title: Raw "Major Works" cell. Non-string values (e.g. NaN for a
            missing cell) never match.
        words: Optional iterable of lower-case words to look for. When
            omitted, the module-level ``words_2x`` list is used, so
            ``Series.apply(has_2x_word)`` keeps working unchanged.

    Returns:
        bool: whether any target word occurs as a token of the title.
    """
    if not isinstance(title, str):
        return False
    if words is None:
        words = words_2x                                     # default: module-level list
    clean_title = title.replace('*', '').lower()             # Clean title
    # A set gives constant-time membership instead of rescanning a list per word.
    title_tokens = set(re.findall(r'\b[a-zA-Z]+\b', clean_title))
    return not title_tokens.isdisjoint(words)

# Boolean mask: does the cell contain any of the twice-occurring words?
contains_2x_word = df_creators['Major Works'].apply(has_2x_word)
titles_with_2x = df_creators.loc[contains_2x_word, 'Major Works'].tolist()

print("TITLES CONTAINING WORDS WITH 2 OCCURRENCES :")
for works_cell in titles_with_2x:
    print(works_cell)

TITLES CONTAINING WORDS WITH 2 OCCURRENCES :
*Philaster*, *The Maid's Tragedy*
*The Shoemaker's Holiday*, *The Honest Whore*
*The Faithful Shepherdess*, *The Spanish Curate*
*'Tis Pity She’s a Whore*
*Friar Bacon and Friar Bungay*, *Pandosto*
*A Woman Killed with Kindness*, *The English Traveller*
*The Spanish Tragedy*
*The Roman Actor*, *A New Way to Pay Old Debts*
*The Changeling*, *Women Beware Women*
*Summer's Last Will and Testament*, *The Unfortunate Traveller*
*The Old Wives' Tale*, *Edward I*
*A Shoemaker a Gentleman*, *The Witch*

def _mean_title_length(titles):
    """Return the mean character count of the whitespace-stripped titles."""
    stripped_lengths = [len(title.strip()) for title in titles]
    return sum(stripped_lengths) / len(stripped_lengths)


# Titles without the '*' markers, then one list of titles per author
# (cells are comma-separated).
df_creators['Clean_Titles'] = df_creators['Major Works'].str.replace('*', '', regex=False)
df_creators['Titles_List'] = df_creators['Clean_Titles'].str.split(',')

# Average title length (in characters) per author.
df_creators['Avg_Title_Length'] = df_creators['Titles_List'].apply(_mean_title_length)

bar_palette = plt.cm.plasma(np.linspace(0, 1, len(df_creators)))   # one colour per author

plt.figure(figsize=(12, 6))
plt.barh(df_creators['Author'], df_creators['Avg_Title_Length'], color=bar_palette)
plt.title("📜 AVERAGE TITLES LENGTH per AUTHOR")
plt.xlabel("Avg Length of Major Works Titles")
plt.gca().invert_yaxis()  # first row of the table at the top (rows are not sorted by length)
plt.show()

# Titles without the '*' markers, then one list of titles per author.
starless_works = df_creators['Major Works'].str.replace('*', '', regex=False)
df_creators['Clean_Titles'] = starless_works
df_creators['Titles_List'] = starless_works.str.split(',')

# One row per (author, title); the author's index label is repeated.
titles_expanded = df_creators[['Author', 'Titles_List']].explode('Titles_List')
titles_expanded['Title'] = titles_expanded['Titles_List'].str.strip()
titles_expanded['Title_Length'] = titles_expanded['Title'].map(len)

# Longest titles first, so head() gives the longest and tail() the shortest.
titles_sorted = titles_expanded.sort_values('Title_Length', ascending=False)

report_columns = ['Author', 'Title', 'Title_Length']

print("TOP 3 lONGEST TITLES :")
print(titles_sorted.head(3)[report_columns])

print("\nTOP 3 SHORTEST TITLES :")
print(titles_sorted.tail(3)[report_columns])

TOP 3 lONGEST TITLES :
            Author                             Title  Title_Length
14    Thomas Nashe  Summer's Last Will and Testament            32
5    Robert Greene      Friar Bacon and Friar Bungay            28
6   Thomas Heywood      A Woman Killed with Kindness            28

TOP 3 SHORTEST TITLES :
                 Author    Title  Title_Length
17  William Shakespeare  Macbeth             7
17  William Shakespeare  Othello             7
17  William Shakespeare   Hamlet             6

# Mean of the per-author average title length within each birthplace,
# highest first.
avg_by_birthplace = df_creators.groupby('Birthplace')['Avg_Title_Length'].mean()
grouped_avg = avg_by_birthplace.sort_values(ascending=False)

bar_palette = plt.cm.cividis(np.linspace(0, 1, len(grouped_avg)))   # one colour per bar

plt.figure(figsize=(12, 6))
grouped_avg.plot.bar(color=bar_palette)
plt.title("🌍 BIRTHPLACE vs AVERAGE TITLE LENGTH")
plt.ylabel("Avg Title Length per Title")
plt.show()

	Author	Birth	Death	Birthplace	Period	Genre	Major Works	Style & Themes	Iconic Creatures	Visual Motifs	Relevance	Notes
count	20	20.00	20.00	20	20	20	20	20	20	20	20	20
unique	20	NaN	NaN	13	5	16	20	20	20	20	20	20
top	Francis Beaumont	NaN	NaN	London	Jacobean	Tragedy	Philaster, The Maid's Tragedy	Honor, loyalty, betrayal, romance	Disguised lovers, corrupt courtiers	💔🗡️👑🎭🕷️	Noted collaborator (Fletcher)	Early innovator of tragicomedy
freq	1	NaN	NaN	8	7	3	1	1	1	1	1	1
mean	NaN	1572.35	1622.45	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
std	NaN	11.91	20.11	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
min	NaN	1554.00	1592.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
25%	NaN	1562.75	1604.75	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
50%	NaN	1573.00	1626.50	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
75%	NaN	1580.75	1634.75	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
max	NaN	1596.00	1666.00	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

ToGitOrNotToGit 💀

EDA 🎭 `creators` dataset

	Author	Birth	Death	Birthplace	Period	Genre	Major Works	Style & Themes	Iconic Creatures	Visual Motifs	Relevance	Notes
0	Francis Beaumont	1584	1616	Leicestershire	Jacobean	Comedy, Tragicomedy	Philaster, The Maid's Tragedy	Honor, loyalty, betrayal, romance	Disguised lovers, corrupt courtiers	💔🗡️👑🎭🕷️	Noted collaborator (Fletcher)	Early innovator of tragicomedy
1	George Chapman	1559	1634	Hitchin	Jacobean	Tragedy, Translation	Bussy D'Ambois, Homer’s Iliad	Stoicism, heroism, philosophical depth	Noble outcasts, visionaries	🦉👑🗡️🔥🕯️	Literary bridge to classicism	Lofty, sometimes hard to stage
2	Thomas Dekker	1572	1632	London	Elizabethan	Comedy, Satire, Morality	The Shoemaker's Holiday, The Honest Whore	Urban life, resilience, wit, plague	Jesters, apprentices, tricksters	🎭🏙️🪶🧵💀	Urban voice, satirical chronicler	Known for vibrant city scenes, debt prison stints
3	John Fletcher	1579	1625	Rye	Jacobean, Caroline	Tragicomedy, Comedy	The Faithful Shepherdess, The Spanish Curate	Fluid morality, love vs. reason, comic intrigue	Servants, witty heroines	💔🎭🪶🧠🪞	Succeeded Shakespeare at the Globe	Master of tragicomedy twist
4	John Ford	1586	1639	Devon	Jacobean, Caroline	Tragedy, Morality	'Tis Pity She’s a Whore	Forbidden desire, fate, taboo, introspection	Incestuous lovers, avengers	💔💀🗡️🔥🕷️	Boundary-pushing moralist	Notorious for dark sexual politics

	Author	Birth	Death	Age
9	John Lyly	1554	1606	52
15	George Peele	1556	1596	40
5	Robert Greene	1558	1592	34
8	Thomas Kyd	1558	1594	36
1	George Chapman	1559	1634	75
11	Christopher Marlowe	1564	1593	29
17	William Shakespeare	1564	1616	52
14	Thomas Nashe	1567	1601	34
2	Thomas Dekker	1572	1632	60
7	Ben Jonson	1572	1637	65
6	Thomas Heywood	1574	1641	67
10	John Marston	1576	1634	58
3	John Fletcher	1579	1625	46
13	Thomas Middleton	1580	1627	47
19	John Webster	1580	1634	54
12	Philip Massinger	1583	1640	57
0	Francis Beaumont	1584	1616	32
16	William Rowley	1585	1626	41
4	John Ford	1586	1639	53
18	James Shirley	1596	1666	70

ToGitOrNotToGit 💀

EDA 🎭 creators dataset

EDA 🎭 `creators` dataset