# Final Project

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
from matplotlib.animation import FuncAnimation
from matplotlib.animation import FFMpegWriter
from matplotlib import cm
from scipy import interpolate

In [None]:
# Dataset Source: Kaggle
# https://www.kaggle.com/datasets/conorvaneden/best-songs-on-spotify-for-every-year-2000-2023/
file_path = 'spotify.csv'

# The file is semicolon-delimited.  `error_bad_lines=False` was deprecated in
# pandas 1.3 and removed in pandas 2.0; `on_bad_lines='skip'` is the supported
# way to drop malformed rows instead of aborting the whole load.
df = pd.read_csv(file_path, delimiter=';', on_bad_lines='skip')

# just to check: show the first rows
df.head()

In [None]:
# Columns reused by the cells below.
titles, genre, year, bpm = (df[column] for column in ("title", "top genre", "year", "bpm"))
# Distinct years from 2000 onwards, in ascending order.
unique_years = np.sort(year[year >= 2000].unique())

# Plotting Interesting Trends

## Plot: Most Popular Words in Song Titles Across All Years

In [None]:
# Most Popular Words in Song Titles

# Merge every title into one lowercase string that contains only ASCII letters
# and whitespace.  Missing titles are dropped first so the join cannot fail on
# NaN values.
all_titles = ' '.join(titles.dropna().astype(str))
all_titles = re.sub(r'[^a-zA-Z\s]', '', all_titles)
all_titles = all_titles.lower()

words = all_titles.split()

# Words that carry no meaning for this plot: articles, prepositions and
# release tags such as "feat", "remix" or "radio edit".
exclude = ['feat', 'with', 'the', 'a', 'i', 'on', 'in', 'to', 'it', 'radio', 'edit', 'remix', 'of']
excluded = set(exclude)  # set lookup instead of scanning the list per word
filtered_words = [word for word in words if word not in excluded]

unique_words, word_counts = np.unique(filtered_words, return_counts=True)

# Order the vocabulary from most to least frequent.
sorted_indices = np.argsort(word_counts)[::-1]
unique_words = unique_words[sorted_indices]
word_counts = word_counts[sorted_indices]

top_words = 20  # adjust val if want more
# Never ask for more bars than there are distinct words; otherwise the bar
# positions and the heights/labels would have different lengths.
shown = min(top_words, len(unique_words))
plt.figure(figsize=(10, 5))
plt.bar(range(shown), word_counts[:shown], tick_label=unique_words[:shown], color='skyblue')
plt.xticks(rotation=45, ha="right")
plt.xlabel('Words')
plt.ylabel('Frequency')
plt.title('Top Words in Song Titles (filtered)')
plt.show()

## Plot: Tracking Some of Those Most Popular Words (and Other Interesting Ones) By Year

In [None]:
# Tracking Some of Those Most Popular Words (and Other Interesting Ones) By Year

# Songs from 2000 onwards whose genre label contains "pop"; their per-year
# counts (base2) provide the x-axis tick positions of the plot below.
is_pop = df["top genre"].str.lower().str.contains('pop')
is_recent = df["year"] >= 2000
base1 = df[is_pop & is_recent]
base2 = base1["year"].value_counts().sort_index()
words_to_track = ["love", "my", "you", "light", "me", "day"]
def count_words(text, word):
    """Return how many times `word` occurs in `text` as a whole word.

    Matching is case-insensitive, `word` is taken literally (it is
    regex-escaped) and must be delimited by word boundaries on both sides.
    """
    whole_word = re.compile(r'\b' + re.escape(word) + r'\b', flags=re.IGNORECASE)
    return len(whole_word.findall(text))

plt.figure(figsize=(12, 8))
# Titles grouped by year (2000 onwards); the grouping is the same for every
# tracked word, so it is built once.
titles_by_year = df[df['year'] >= 2000].groupby('year')['title']
for word in words_to_track:
    # Total whole-word occurrences of `word` across all titles of each year.
    word_counts_by_year = titles_by_year.apply(
        lambda year_titles: year_titles.apply(lambda title: count_words(title, word)).sum()
    )
    plt.plot(word_counts_by_year.index, word_counts_by_year.values, label=word, marker='o')

plt.title('Word Frequency from 2000 to 2023')
plt.xlabel('Year')
plt.ylabel('Frequency')
plt.xticks(base2.index, rotation=45)
plt.legend()
plt.grid(True)
plt.show()

## Plot: Number of Pop Songs Per Year

In [None]:
# Number of Pop Songs Per Year

# Songs from 2000 onwards whose genre label contains "pop" (case-insensitive).
genre_has_pop = df["top genre"].str.lower().str.contains('pop')
pop_songs = df[genre_has_pop & (df["year"] >= 2000)]
# Number of such songs per year, in chronological order.
pop_counts = pop_songs["year"].value_counts().sort_index()

plt.figure(figsize=(12, 6))
plt.plot(pop_counts.index, pop_counts.values, marker='o', linestyle='-', color='orange')
plt.title('Number of Popular Songs Classified as Pop per Year')
plt.xlabel('Year')
plt.ylabel('Number of Pop Songs')
plt.xticks(pop_counts.index, rotation=45)
plt.grid(True)
plt.show()

## Plot: Most Popular BPM Per Year

In [None]:
# Most Popular BPM Per Year

def _most_common(values):
    """Return the most frequent value in `values` (smallest on ties, NaN if none)."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan

# The plot is labelled "Most Popular/Common BPM", so each year is summarised by
# its most frequent BPM.  The previous `.max()` plotted the highest BPM of the
# year instead.  The variable keeps its original name for compatibility.
yearly_max_bpm = df[df['year'] >= 2000].groupby('year')['bpm'].agg(_most_common)
plt.figure(figsize=(12, 6))
plt.plot(yearly_max_bpm.index, yearly_max_bpm.values, marker='o', linestyle='-', color='green')
plt.xlabel('Year')
plt.ylabel('Most Popular/Common BPM')
plt.title('Most Popular BPM per Year')
plt.xticks(yearly_max_bpm.index, rotation=45)
plt.grid(True)
plt.show()

## Plot: Genres (of Interest) Counts Per Year

In [None]:
# Plotting Genres of Interest Counts Per Year

df_filtered = df[df['year'] >= 2000]

# defined genres of interest; a song counts towards a genre when its
# "top genre" label contains the name as a substring (so 'rap' would also
# match a label such as 'trap').
genres_of_interest = ['hip hop', 'pop', 'rap', 'r&b', 'rock']
plt.figure(figsize=(15, 6))
colors = plt.cm.tab10.colors
genre_labels = df_filtered['top genre'].str.lower()

# The loop variable is `genre_name` rather than `genre` so it does not
# overwrite the `genre` column series extracted earlier in the notebook.
for color, genre_name in zip(colors, genres_of_interest):
    # Literal substring match; rows without a genre label simply do not match.
    matches = genre_labels.str.contains(genre_name, regex=False, na=False)
    genre_counts = df_filtered.loc[matches, 'year'].value_counts().sort_index()
    plt.plot(genre_counts.index, genre_counts.values, marker='o', linestyle='-', label=genre_name, color=color)

plt.xlabel('Year')
plt.ylabel('Total Count')
plt.title('Total Count of Selected Genres from 2000 to 2023')
plt.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.grid(True)
# Tick every year present in this cell's own data instead of relying on
# `pop_counts`, which only exists after the pop-songs cell has been run.
plt.xticks(sorted(df_filtered['year'].unique()), rotation=45)
plt.ylim(bottom=0, top=80)
plt.yticks(range(0, 81, 10))

plt.show()

# Simulation Method: Random Walks

### Generate 3-word song titles using random walks based on existing song titles and most frequently used words (using a Markov chain approach)

In [None]:
# Generate 3-word song titles using random walks based on existing song titles and most frequently used words,
# Using a Markov Chain approach

# Corpus preparation: join all titles, drop parenthesised segments, keep only
# ASCII letters and whitespace, then lowercase.
all_titles = re.sub(
    r'[^a-zA-Z\s]',
    '',
    re.sub(r'\([^)]*\)', '', ' '.join(titles)),
).lower()
words = all_titles.split()

# Articles, prepositions and release tags are left out of the chain.
exclude = ['feat', 'with', 'the', 'a', 'i', 'on', 'in', 'to', 'it', 'radio', 'edit', 'remix', 'of']
filtered_words = [candidate for candidate in words if candidate not in exclude]

def create_transition_matrix(words):
    """Build a first-order Markov transition matrix from a word sequence.

    Args:
        words: Sequence of words in corpus order.

    Returns:
        A tuple ``(transition_matrix, unique_words)``.  ``unique_words`` is the
        sorted array of distinct words and ``transition_matrix[i, j]`` is the
        probability that ``unique_words[j]`` directly follows
        ``unique_words[i]``.  Every row sums to 1.
    """
    unique_words, word_ids = np.unique(words, return_inverse=True)
    word_ids = word_ids.reshape(-1)
    vocab_size = len(unique_words)
    transition_matrix = np.zeros((vocab_size, vocab_size))
    # Count every adjacent (current, next) pair.  np.add.at accumulates
    # repeated index pairs, which a plain fancy-indexed `+= 1` would not.
    np.add.at(transition_matrix, (word_ids[:-1], word_ids[1:]), 1)

    row_sums = transition_matrix.sum(axis=1, keepdims=True)
    # A word that is never followed by another one (it only occurs as the very
    # last word) has an all-zero row.  Dividing it by zero would yield NaN,
    # which np.random.choice rejects, so such a word falls back to a uniform
    # distribution over the whole vocabulary.
    dead_ends = row_sums[:, 0] == 0
    transition_matrix[dead_ends] = 1.0
    row_sums[dead_ends] = vocab_size
    return transition_matrix / row_sums, unique_words

# Fit the chain on the filtered title corpus; the matrix and its vocabulary
# are reused by the title generators and animations below.
transition_matrix, unique_words = create_transition_matrix(filtered_words)

def generate_title(transition_matrix, unique_words, num_words=3):
    """Generate a title by a random walk over the word transition matrix.

    The first word is drawn uniformly from `unique_words`; each following
    word is drawn from the transition row of the word before it.

    Returns:
        The generated words joined by single spaces.
    """
    vocab_size = len(unique_words)
    walk = [np.random.choice(unique_words)]
    while len(walk) < num_words:
        # Row of the most recent word holds the probabilities of each
        # candidate next word.
        row = np.flatnonzero(unique_words == walk[-1])[0]
        step = np.random.choice(vocab_size, p=transition_matrix[row])
        walk.append(unique_words[step])
    return ' '.join(walk)

# Print five independently generated 3-word titles, one per line.
for generated_title in (generate_title(transition_matrix, unique_words) for _ in range(5)):
    print(generated_title)

## Visualization
### Recommend Rerunning Multiple Times Until You Find a Word With Multiple Non-Zero Probabilities

In [None]:
def generate_title_with_visualization(transition_matrix, unique_words, num_words=3):
    """Generate a title by random walk, plotting each step's next-word odds.

    Works like a plain random walk over `transition_matrix`, but before each
    chosen word is appended the transition probabilities of the current word
    are shown via `visualize_transition_probabilities`.

    Returns:
        The generated words joined by single spaces.
    """
    walk = [np.random.choice(unique_words)]

    for _ in range(num_words - 1):
        current_word = walk[-1]
        row = transition_matrix[np.flatnonzero(unique_words == current_word)[0]]
        next_word = unique_words[np.random.choice(len(unique_words), p=row)]
        visualize_transition_probabilities(current_word, unique_words, row)
        walk.append(next_word)

    return ' '.join(walk)

def visualize_transition_probabilities(current_word, unique_words, probabilities):
    """Show a bar chart of the words that can follow `current_word`.

    Only words with a strictly positive probability are plotted.  When there
    are none, a message is printed and no figure is created.
    """
    candidates = []
    likelihoods = []
    for word, prob in zip(unique_words, probabilities):
        if prob > 0:
            candidates.append(word)
            likelihoods.append(prob)

    if not candidates:
        print(f'No nonzero probabilities for transition from "{current_word}" to any word.')
        return

    plt.figure(figsize=(10, 6))
    plt.bar(candidates, likelihoods, color='skyblue')
    plt.title(f'Transition Probabilities from "{current_word}" to Next Words')
    plt.xlabel('Next Word')
    plt.ylabel('Probability')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()
    
# Run one visualised random walk and print the resulting title.
generated_title = generate_title_with_visualization(transition_matrix, unique_words)
print(generated_title)

## Animation 1: Random Walks

In [None]:
def visualize_transition_probabilities(current_word, unique_words, probabilities, ax):
    """Draw the next-word distribution of `current_word` on the axes `ax`.

    Args:
        current_word: Word whose outgoing transitions are shown.
        unique_words: Vocabulary, aligned with `probabilities`.
        probabilities: Transition probabilities out of `current_word`.
        ax: Axes to draw on; it is cleared before drawing.

    Only words with a strictly positive probability get a bar.  When there
    are none, `ax` is left untouched.
    """
    reachable = [(word, prob) for word, prob in zip(unique_words, probabilities) if prob > 0]
    if not reachable:
        return
    words, probs = zip(*reachable)
    positions = range(len(words))
    ax.clear()
    ax.bar(positions, probs, color='skyblue')
    ax.set_ylim(0, 1)  # same probability scale on every panel
    ax.set_xlabel('Next Word')
    ax.set_ylabel('Probability')
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=45, ha='right')
    ax.grid(True)
    # One title call only: the earlier generic 'Transition Probabilities'
    # title was overwritten by this one before it could ever be displayed.
    ax.set_title(f'"{current_word}" to Next Words', pad=20)
    
def generate_title(transition_matrix, unique_words, num_words=3):
    """Generate a random-walk title and return its words as a list.

    The first word is drawn uniformly from `unique_words`; each following
    word is drawn from the transition row of the word before it.  The words
    are returned unjoined, in walk order.
    """
    vocab_size = len(unique_words)
    walk = [np.random.choice(unique_words)]
    while len(walk) < num_words:
        row = np.flatnonzero(unique_words == walk[-1])[0]
        step = np.random.choice(vocab_size, p=transition_matrix[row])
        walk.append(unique_words[step])
    return walk

def update(frame):
    """Redraw the three panels for a freshly generated 3-word random walk.

    `frame` is supplied by FuncAnimation but not used: every frame shows an
    independent walk, one panel per word.
    """
    for panel in (ax1, ax2, ax3):
        panel.clear()
    walk = generate_title(transition_matrix, unique_words, num_words=3)
    for panel, word in zip(axs, walk):
        row = np.where(unique_words == word)[0][0]
        visualize_transition_probabilities(word, unique_words, transition_matrix[row], panel)
# One panel per word of the generated title.
fig, panels = plt.subplots(1, 3, figsize=(15, 5))
ax1, ax2, ax3 = panels
axs = [ax1, ax2, ax3]
max_frame = 50
animation = FuncAnimation(fig, update, frames=range(max_frame), repeat=False)
# 50 frames at 2 frames per second, written as an MP4 file.
writer = FFMpegWriter(fps=2, metadata={'artist': 'Me'}, bitrate=1800)
animation.save('animation1.mp4', writer=writer)

plt.show()

# 3 Animations

## Animation 2: Genres Plotting

In [None]:
# Animation for Genres Plotting

# Chronological row order; only songs from 2000 onwards are animated.
df = df.sort_values(by='year', ascending=True)
df_filtered = df[df['year'] >= 2000]

# defined genres of interest
genres_of_interest = ['hip hop', 'pop', 'rap', 'r&b', 'rock']

fig, ax = plt.subplots(figsize=(10, 6))
colors = plt.cm.tab10.colors
# One line per genre, initialised flat at zero; the animation callback
# replaces the data frame by frame.
flat_zero = [0] * len(unique_years)
lines = [
    ax.plot(unique_years, flat_zero, marker='o', linestyle='-', label=genre_name, color=color)[0]
    for genre_name, color in zip(genres_of_interest, colors)
]

ax.set(xlabel='Year', ylabel='Total Count')
plt.title('Total Count of Selected Genres from 2000 to 2023')
plt.xticks(unique_years, rotation=45)
plt.ylim(bottom=0, top=80)
plt.yticks(range(0, 81, 10))

# Songs per year for each genre of interest.  These counts do not depend on
# the frame, so they are computed once here instead of on every frame.
# Matching is a literal, case-insensitive substring test; rows without a
# genre label never match.
_genre_year_counts = {
    genre_name: df_filtered.loc[
        df_filtered['top genre'].str.lower().str.contains(genre_name, regex=False, na=False),
        'year',
    ].value_counts()
    for genre_name in genres_of_interest
}


def update(frame):
    """Reveal the genre lines up to the `frame`-th year (0-based).

    Args:
        frame: Frame number from FuncAnimation; years
            ``unique_years[:frame + 1]`` are shown.
    """
    years_sorted = sorted(unique_years[:frame + 1])
    for line, genre_name in zip(lines, genres_of_interest):
        counts = _genre_year_counts[genre_name]
        # Years in which the genre has no songs are plotted as 0.
        line.set_data(years_sorted, [counts.get(shown_year, 0) for shown_year in years_sorted])

# Static decorations, set once before any frame is rendered.
ax.legend(title='Genre')
ax.grid(True)

# One frame per distinct year in the filtered data, half a second apart.
frame_count = len(df_filtered['year'].unique())
animation = FuncAnimation(fig, update, frames=frame_count, interval=500, repeat=False)

# saving animation as MP4 file
writer = FFMpegWriter(fps=1, metadata={'artist': 'Me'}, bitrate=1800)
animation.save('animation2.mp4', writer=writer)

plt.show()

## Animation 3: BPM Plotting

In [None]:
# Animation for BPM plotting

def update(frame):
    """Redraw the per-year BPM line up to and including the year `frame`.

    The current figure is cleared and rebuilt on every call, so the labels,
    ticks and grid are re-applied each time.
    """
    plt.clf()
    up_to_frame = yearly_max_bpm.loc[yearly_max_bpm.index <= frame]
    years = up_to_frame.index
    plt.plot(years, up_to_frame.values, marker='o', color='blue')
    plt.title('Most Popular BPM per Year')
    plt.xlabel('Year')
    plt.ylabel('Most Popular/Common BPM')
    plt.xticks(years, rotation=45)
    plt.grid(axis='y')

def _most_common(values):
    """Return the most frequent value in `values` (smallest on ties, NaN if none)."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# The animation is labelled "Most Popular/Common BPM", so each year is
# summarised by its most frequent BPM.  The previous `.max()` plotted the
# highest BPM of the year instead.  The variable keeps its original name
# because the frame callback reads it.
yearly_max_bpm = df[df['year'] >= 2000].groupby('year')['bpm'].agg(_most_common)
fig, ax = plt.subplots(figsize=(12, 6))

# int() keeps range() working even when the year index is float-typed.
max_year = int(yearly_max_bpm.index.max())
animation = FuncAnimation(fig, update, frames=range(2000, max_year + 1), repeat=False)

# saving animation as MP4 file
writer = FFMpegWriter(fps=1, metadata=dict(artist='Me'), bitrate=1800)
animation.save('animation3.mp4', writer=writer)

plt.show()

## Animation 4: Interesting Words in Song Titles Per Year Plotting

In [None]:
# Animation for Interesting Words in Song Titles Per Year Plotting

# Only songs from 2000 onwards take part in the animation.
df_filtered = df[df['year'] >= 2000]
# Words whose per-year title frequency is drawn, one line each.
words_to_track = ["love", "my", "you", "light", "me", "day"]

def count_words(text, word):
    """Return how many times `word` occurs in `text` as a whole word.

    Matching is case-insensitive, `word` is taken literally (it is
    regex-escaped) and must be delimited by word boundaries on both sides.
    """
    whole_word = re.compile(r'\b' + re.escape(word) + r'\b', flags=re.IGNORECASE)
    return len(whole_word.findall(text))

def update(frame):
    """Redraw the word-frequency lines for all years up to and including `frame`.

    Args:
        frame: Last year to include in the plot.
    """
    plt.clf()
    plt.title('Word Frequency from 2000 to 2023')
    # The year filter is identical for every tracked word, so it is applied
    # once per frame rather than once per word.
    titles_by_year = df_filtered[df_filtered['year'] <= frame].groupby('year')['title']
    for word in words_to_track:
        # Total whole-word occurrences of `word` across each year's titles.
        word_counts_by_year = titles_by_year.apply(
            lambda year_titles: year_titles.apply(lambda title: count_words(title, word)).sum()
        )
        plt.plot(word_counts_by_year.index, word_counts_by_year.values, label=word, marker='o')
    # Tick every year of the filtered data on each frame.  The years come from
    # df_filtered rather than `pop_counts`, so this cell no longer depends on
    # the pop-songs cell having been run.
    plt.xticks(sorted(df_filtered['year'].unique()), rotation=45)
    plt.xlabel('Year')
    plt.ylabel('Frequency')
    plt.legend()
    plt.grid(True)

fig, ax = plt.subplots(figsize=(12, 8))
# df_filtered already contains only years >= 2000, so its maximum year is the
# last frame of the animation.
max_year = df_filtered['year'].max()
frame_years = range(2000, max_year + 1)

animation = FuncAnimation(fig, update, frames=frame_years, repeat=False)
# saving animation as MP4 file
writer = FFMpegWriter(fps=1, metadata={'artist': 'Me'}, bitrate=1800)
animation.save('animation4.mp4', writer=writer)

plt.show()