The ability to scrape and analyze data gives any computer science student a sense of freedom they have never experienced before. Much of the world's data lives on web pages, and people make important decisions based on it: think finance, politics, culture, and so on. This data can also reveal the history of pretty much anything you are interested in. Imagine how exciting it would be to master scraping any data you care about, building an AI pipeline to analyze it, and creating (maybe even commercial) applications that act on what you find in the real world. So let's dig in!

With this trick, you can generate every URL on the site by simply incrementing the page number, and then scrape each page. Scraping happens in two stages: first collect every celebrity's URL, then retrieve all the personal information. Below is the Python 3.6 code for the first stage; it gathers all celebrities' URLs and stores them in a pickled dict for the next stage:
```python
from bs4 import BeautifulSoup
import requests
from tqdm import tqdm
import string
import re
import pickle
from collections import defaultdict

# Browse pages are paginated per letter; keep incrementing the page number until a page comes back empty.
base = 'http://www.whosdatedwho.com/popular?letter={}&page={}&_block=data.browseGrid'

all_person_list = defaultdict(lambda: dict())

for l in tqdm(list(string.ascii_lowercase)):
    for n in tqdm(range(1, 1000)):
        c = requests.get(base.format(l, n)).content
        soup = BeautifulSoup(c, 'html.parser')
        # Each celebrity appears as an <li class="ff-grid-box ff-list"> wrapping a link to their page.
        person_list = list(
            map(lambda x: x.a['href'],
                soup.find_all('li', {'class': 'ff-grid-box ff-list'}))
        )
        if len(person_list) == 0:
            # No more results for this letter; move on to the next one.
            break
        else:
            for p in person_list:
                # Use the last path segment of the URL as a short, unique name.
                p_name = re.findall(r'[^/]+(?=/$|$)', p)[0]
                all_person_list[p_name]['url'] = p

with open('all-person-list.pickle', 'wb') as handle:
    pickle.dump(dict(all_person_list), handle, protocol=pickle.HIGHEST_PROTOCOL)
```
The second stage visits each person's page and extracts the personal facts (zodiac sign, age, whether they have died) together with the dating-history table:

```python
from bs4 import BeautifulSoup
import requests
import pickle
from tqdm import tqdm

def get_person_info(l):
    c = requests.get(l).content
    soup = BeautifulSoup(c, 'html.parser')
    hist = soup.find_all('div', {'id': 'ff-dating-history-table'})
    personal_info = {}
    relation_info = []
    # The top of each profile has three header/fact/footer triples (e.g. zodiac, age, relationships).
    headers = list(map(lambda x: x.text, soup.find_all('div', {'class': 'header'})[:3]))
    facts = list(map(lambda x: x.text, soup.find_all('div', {'class': 'fact'})[:3]))
    footers = list(map(lambda x: x.text, soup.find_all('div', {'class': 'footer'})[:3]))
    for i, h in enumerate(headers):
        if h is not None:
            personal_info[h.lower()] = facts[i].strip()
            if 'death' in footers[i].strip():
                personal_info['dead'] = True
            else:
                personal_info['dead'] = False
            if ('at death' not in footers[i].strip()
                    and 'years old' not in footers[i].strip()
                    and 'total' not in footers[i].strip()):
                personal_info[h.lower()] = footers[i].strip()
    if len(hist) == 0:
        pass
    else:
        # Parse the dating-history table row by row, skipping the header row.
        table = hist[0].find('table')
        for row in table.findAll('tr')[1:]:
            col = row.findAll('td')
            name = col[1].getText().strip()
            name_url = col[1].a['href']
            status = col[2].getText().strip()
            time_start = col[4].getText().strip()
            time_end = col[5].getText().strip()
            duration = col[6].getText().strip()
            relation_info.append({'name': name, 'name_url': name_url,
                                  'time_start': time_start, 'time_end': time_end,
                                  'duration': duration})
    return {'personal': personal_info, 'relation': relation_info}

with open('all-person-list.pickle', 'rb') as handle:
    all_person = pickle.load(handle)

for p in tqdm(all_person):
    try:
        all_person[p]['info'] = get_person_info(all_person[p]['url'])
    except Exception:
        print('fail', p)

with open('all-person-info.pickle', 'wb') as handle:
    pickle.dump(dict(all_person), handle, protocol=pickle.HIGHEST_PROTOCOL)
```
With everything scraped, the analysis code below loads the pickle, converts durations and dates into usable numbers, and flattens the data into two pandas DataFrames: one row per person and one row per relationship.

```python
import pickle
import re
import numpy as np
import seaborn as sns
from collections import Counter
import networkx as nx
from tqdm import tqdm
import matplotlib.pyplot as plt
import pandas as pd

with open('all-person-info.pickle', 'rb') as handle:
    info = pickle.load(handle)

def str_to_yr(i):
    # Convert duration strings like "3 years" or "8 months" into a number of years.
    if 'year' in i:
        return float(re.findall(r'\d+', i)[0])
    if 'month' in i:
        return float(re.findall(r'\d+', i)[0]) / 12
    else:
        return 'unknown'

def get_yr(i):
    # Pull a four-digit year out of a date string, if there is one.
    try:
        return re.findall(r'\d{4}', i)[0]
    except:
        return 'unknown'

def horo_type_f(i):
    # Map each zodiac sign to its classical element.
    temp = {'Aquarius': 'air', 'Aries': 'fire', 'Cancer': 'water', 'Capricorn': 'earth',
            'Gemini': 'air', 'Leo': 'fire', 'Libra': 'air', 'Pisces': 'water',
            'Sagittarius': 'fire', 'Scorpio': 'water', 'Taurus': 'earth', 'Virgo': 'earth',
            'unknown': 'unknown'}
    return temp[i]

def plotBar(l):
    # Bar chart of the most common items in a list.
    zodiac_c = list(zip(*Counter(l).most_common()))
    plt.bar(range(len(zodiac_c[0])), zodiac_c[1], align='center')
    plt.xticks(range(len(zodiac_c[0])), zodiac_c[0], rotation='vertical')
    plt.show()
    return zodiac_c

zodiac = []
zodiac_types = []
age_l = []
dead_l = []
zodiac_couple = []
zodiac_type_couple = []
couple_duration = []
couple_start = []
num_rela = []
name_shorts = []
relation_shorts = []
G = nx.Graph()

for name_short in tqdm(info):
    if 'info' not in info[name_short]:
        continue  # skip people whose page failed to scrape
    name_shorts.append(name_short)
    personal = info[name_short]['info']['personal']
    horo = personal['zodiac'] if 'zodiac' in personal else 'unknown'
    age = personal['age'] if 'age' in personal else 'unknown'
    dead = personal['dead'] if 'dead' in personal else 'unknown'
    zodiac.append(horo)
    horo_type = horo_type_f(horo)
    zodiac_types.append(horo_type)
    age_l.append(age)
    dead_l.append(dead)
    num_rela.append(len(info[name_short]['info']['relation']))
    for rela in info[name_short]['info']['relation']:
        name_short_other = re.findall(r'[^/]+(?=/$|$)', rela['name_url'])[0]
        relation_shorts.append('+'.join(sorted([name_short, name_short_other])))
        duration = str_to_yr(rela['duration'])
        start = get_yr(rela['time_start'])
        if name_short_other not in info:
            horo_other = 'unknown'
        elif 'zodiac' in info[name_short_other]['info']['personal']:
            horo_other = info[name_short_other]['info']['personal']['zodiac']
        else:
            horo_other = 'unknown'
        horo_other_type = horo_type_f(horo_other)
        zodiac_couple.append(tuple(sorted([horo, horo_other])))
        zodiac_type_couple.append(tuple(sorted([horo_type, horo_other_type])))
        couple_duration.append(duration)
        couple_start.append(start)

people_df = pd.DataFrame({'name': name_shorts, 'zodiac': zodiac,
                          'zodiac_element': zodiac_types, 'age': age_l,
                          'num_rela': num_rela})
relation_df = pd.DataFrame({'couple': relation_shorts, 'start': couple_start,
                            'duration': couple_duration, 'zodiac': zodiac_couple,
                            'zodiac_element': zodiac_type_couple})
```
Now the fun part begins. Let's explore freely; here is what the data looks like:
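A quick way to peek at the two DataFrames (a small sketch reusing the people_df and relation_df built above):

```python
# A quick look at the two DataFrames built above.
print(people_df.shape, relation_df.shape)   # number of people / number of relationships
print(people_df.head())
print(relation_df.head())

# How complete are the key fields?
print('known zodiac:', (people_df.zodiac != 'unknown').mean())
print('known duration:', (relation_df.duration != 'unknown').mean())
```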
This plots the distribution of relationship durations:
```python
temp = [i for i in relation_df.duration[relation_df.duration != 'unknown'].tolist() if i < 80]
sns.distplot(temp, axlabel='year')
```

Next, the mean age by zodiac element:

```python
temp = people_df[(people_df.zodiac_element != 'unknown') &
                 (people_df.age != 'unknown') &
                 (people_df.age != 'year old')]
temp['age'] = temp['age'].apply(lambda x: int(x))
temp = temp.groupby(temp.zodiac_element)[['age']].mean()
temp.rename(index=str, columns={'age': 'mean_age'}).plot.bar()
```

The same grouping gives the median number of relationships per zodiac element:

```python
temp = people_df[(people_df.zodiac_element != 'unknown') &
                 (people_df.age != 'unknown') &
                 (people_df.age != 'year old')]
temp = temp.groupby(temp.zodiac_element)[['num_rela']].median()
temp.rename(index=str, columns={'num_rela': 'median_num_rela'}).plot.bar()
```

Now for couples: the number of relationships and the median relationship duration for each pairing of zodiac elements:

```python
temp = relation_df[(relation_df.duration != 'unknown') &
                   (relation_df.duration != 'year old') &
                   (relation_df.zodiac_element.apply(lambda x: x[0]) != 'unknown') &
                   (relation_df.zodiac_element.apply(lambda x: x[1]) != 'unknown')]
temp['duration'] = temp.duration.astype(float)
temp1 = temp.groupby(temp.zodiac_element)[['duration']].count()
temp2 = temp.groupby(temp.zodiac_element)[['duration']].median()
temp1['duration'] = temp1['duration'] / 1000
temp1 = temp1.rename(index=str, columns={'duration': 'count (Thousands)'})
temp2 = temp2.rename(index=str, columns={'duration': 'median_duration'})
```
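The snippet stops at the two renamed tables; to actually chart them in the same style as the earlier bars, something like this would do (temp1 and temp2 come from the block above):

```python
# Sketch: plot the per-pairing tables computed above.
temp1.plot.bar()   # number of relationships (in thousands) per element pairing
temp2.plot.bar()   # median relationship duration (years) per element pairing
plt.show()
```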


To see what the pair counts look like under purely random matching, shuffle four equally sized groups (a stand-in for the four elements) and count neighbouring pairs:

```python
n = 10000
l = [0] * n + [1] * n + [2] * n + [3] * n
np.random.shuffle(l)

from itertools import tee

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = tee(iterable)
    next(b, None)
    return zip(a, b)

plotBar(list(pairwise(l)))
```

See, all combinations occur in similar numbers. It would be interesting to see whether the same trend exists on other dating platforms; if it persists, then, well, the myth would be true and we should date outside our own zodiac element? 😂
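For a direct comparison with this random baseline, the same kind of bar chart can be drawn from the observed couples; a rough sketch, reusing zodiac_type_couple and plotBar from the analysis code above:

```python
# Sketch: compare observed element pairings against the random baseline above.
# Assumes zodiac_type_couple and plotBar are already defined by the analysis code.
observed = [pair for pair in zodiac_type_couple if 'unknown' not in pair]
plotBar(observed)

# Or look at the relative frequencies directly.
from collections import Counter
obs_freq = Counter(observed)
total = sum(obs_freq.values())
for pair, count in obs_freq.most_common():
    print(pair, round(count / total, 3))
```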
Now let's analyze the data through a different lens: graphs 🤯.
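The graph G created in the analysis code is still empty; here is a rough sketch of how the relations could be loaded into it and queried, reusing info, re, and nx from above:

```python
# Sketch: build the dating graph from the scraped relations.
# Nodes are the short names, edges connect two people who dated.
for name_short in info:
    if 'info' not in info[name_short]:
        continue
    for rela in info[name_short]['info']['relation']:
        name_short_other = re.findall(r'[^/]+(?=/$|$)', rela['name_url'])[0]
        G.add_edge(name_short, name_short_other)

# A couple of simple questions this graph can answer:
components = sorted(nx.connected_components(G), key=len, reverse=True)
print('people in the largest connected "dating universe":', len(components[0]))

degrees = sorted(G.degree, key=lambda x: x[1], reverse=True)
print('most-connected celebrities:', degrees[:10])
```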
Unfinished.