In [1]:
# Load modules
import pandas as pd
import six
import seaborn as sns
import geopandas as gpd
import json
import numpy as np
from os import path
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
import matplotlib
if not path.exists('Sarabun-Regular.ttf'):
    !wget https://github.com/google/fonts/raw/main/ofl/sarabun/Sarabun-Regular.ttf -O Sarabun-Regular.ttf
if 'Sarabun' not in [x.name for x in matplotlib.font_manager.fontManager.ttflist]:
    matplotlib.font_manager.fontManager.addfont('Sarabun-Regular.ttf')
matplotlib.rc('font', family='Sarabun')
In [2]:
# Load the JSON files from https://github.com/PanJ/senate67
with open('province-districts.json', 'r', encoding='utf-8') as file:
    province_districts = json.load(file)

with open('district-candidates-round1-1717151315752.json', 'r', encoding='utf-8') as file:
    candidates = json.load(file)

# Create professional and social groups
groups = pd.read_csv('group.csv')
In [3]:
# Create DataFrame for province and districts
province_list = []
for province in province_districts:
    province_id = province['id']
    province_name = province['name']
    for district in province['districts']:
        district_id = district['id']
        district_name = district['name']
        province_list.append({
            'province_id': province_id,
            'province_name': province_name,
            'district_id': district_id,
            'district_name': district_name
        })
df_provinces = pd.DataFrame(province_list)
In [4]:
# Merge candidates and province/district names
df_candidates = pd.DataFrame(candidates).rename(columns={'province': 'province_id', 'district': 'district_id'})
merged_df = df_candidates.merge(df_provinces, on=['province_id', 'district_id']).merge(groups, on = 'job_group')
merged_df['id'] = merged_df.index
In [5]:
# Count the number of candidates for each title
title_counts = merged_df['title'].value_counts().reset_index()
title_counts.columns = ['title', 'total_candidates']

# Display the gender counts
title_counts
Out[5]:
title total_candidates
0 นาย 24680
1 นาง 12572
2 น.ส. 6854
3 ร.ต.อ. 269
4 พ.ต.อ. 153
... ... ...
57 ว่าที่ พ.ต.ต. 1
58 นายกองตรี 1
59 ว่าที่ พ.ต.อ. 1
60 นายหมวดเอก 1
61 นายกองเอก 1

62 rows × 2 columns

In [9]:
def render_mpl_table(data, col_width=3.0, row_height=0.625, font_size=14,
                     header_color='#40466e', row_colors=['#f1f1f2', 'w'], edge_color='w',
                     bbox=[0, 0, 1, 1], header_columns=0,
                     ax=None, **kwargs):
    if ax is None:
        size = (np.array(data.shape[::-1]) + np.array([0, 1])) * np.array([col_width, row_height])
        fig, ax = plt.subplots(figsize=size)
        ax.axis('off')

    mpl_table = ax.table(cellText=data.values, bbox=bbox, colLabels=data.columns, **kwargs)

    mpl_table.auto_set_font_size(False)
    mpl_table.set_fontsize(font_size)

    for k, cell in  six.iteritems(mpl_table._cells):
        cell.set_edgecolor(edge_color)
        if k[0] == 0 or k[1] < header_columns:
            cell.set_text_props(weight='bold', color='w')
            cell.set_facecolor(header_color)
        else:
            cell.set_facecolor(row_colors[k[0]%len(row_colors) ])
    return ax

render_mpl_table(title_counts, header_columns=0, col_width=2.0).figure.savefig("figures/title.jpg", dpi=300)
No description has been provided for this image
In [10]:
# Create the facet grid for age distribution
g = sns.FacetGrid(merged_df, col='job_name', hue='title', col_wrap=4, height=4, sharey=False)
g.map(sns.histplot, 'age', kde=False, bins=15, alpha=0.6)

# Add titles and labels
g.set_axis_labels('Age', 'Count')
g.set_titles(col_template="{col_name}")
g.add_legend(title="Title")

# Adjust layout
plt.subplots_adjust(top=0.9)
g.fig.suptitle('Age Distribution within Each Occupation Group', fontsize=16)

# Show plot
plt.show()
No description has been provided for this image
In [11]:
# Count the number of candidates for each occupation
top_occupations = merged_df['job_name'].value_counts().reset_index()
top_occupations.columns = ['occupation', 'total_candidates']

# Table
render_mpl_table(top_occupations, header_columns=0, col_width=8.0).figure.savefig("figures/top_occupations.jpg", dpi=300)
No description has been provided for this image
In [12]:
# Plot the top occupations using a bar chart
top_occupations = top_occupations.sort_values('total_candidates', ascending = True)
plt.figure(figsize=(16, 16))
bars = plt.barh(top_occupations['occupation'].astype(str), top_occupations['total_candidates'], color='skyblue')
plt.xlabel('Total Number of Candidates', fontsize=14)
plt.title('Number of Candidates by Group', fontsize=14)
plt.yticks(fontsize=14)  # Increase y-tick label font size

plt.xticks(rotation=90, ha='right')
plt.tight_layout()

# Add the exact amount of candidates for each occupation
for bar in bars:
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{bar.get_width()}', va='center', ha='left')

plt.savefig("figures/candidates_by_group.jpg", dpi=300)
No description has been provided for this image
In [13]:
# Group by province and district to count the number of candidates in each district
district_candidate_counts = (
    merged_df
    .groupby(['province_id', 'district_id'])
    .agg(total_candidates=('id', 'size'),
         province_name=('province_name', 'first'),
         district_name=('district_name', 'first'))
    .reset_index()
)

# Sort by total_candidates in descending order
district_candidate_counts = district_candidate_counts.sort_values(by='total_candidates', ascending=True)

# Display the result
district_candidate_counts.tail(20)
Out[13]:
province_id district_id total_candidates province_name district_name
272 33 4 197 ศรีสะเกษ กันทรลักษ์
362 40 1 198 ขอนแก่น เมืองขอนแก่น
187 26 4 198 นครนายก องครักษ์
754 74 2 201 สมุทรสาคร กระทุ่มแบน
97 16 6 205 ลพบุรี บ้านหมี่
341 37 1 207 อำนาจเจริญ เมืองอำนาจเจริญ
713 70 1 213 ราชบุรี เมืองราชบุรี
69 14 1 219 พระนครศรีอยุธยา พระนครศรีอยุธยา
184 26 1 222 นครนายก เมืองนครนายก
513 49 3 233 มุกดาหาร ดอนตาล
50 11 1 234 สมุทรปราการ เมืองสมุทรปราการ
753 74 1 251 สมุทรสาคร เมืองสมุทรสาคร
52 11 3 251 สมุทรปราการ บางพลี
859 90 11 256 สงขลา หาดใหญ่
515 49 5 266 มุกดาหาร คำชะอี
92 16 1 268 ลพบุรี เมืองลพบุรี
511 49 1 269 มุกดาหาร เมืองมุกดาหาร
269 33 1 295 ศรีสะเกษ เมืองศรีสะเกษ
759 76 1 317 เพชรบุรี เมืองเพชรบุรี
518 50 1 344 เชียงใหม่ เมืองเชียงใหม่
In [14]:
# Plot the number of candidates using a bar chart
district_candidate_counts['province_district'] = district_candidate_counts['province_name'] + '_' + district_candidate_counts['district_name']

plt.figure(figsize=(15, 20))
bars = plt.barh(district_candidate_counts.tail(50)['province_district'].astype(str), district_candidate_counts.tail(50)['total_candidates'], color='skyblue')
plt.ylabel('Province', fontsize=14)
plt.xlabel('Total Number of Candidates (Top 50)', fontsize=14)
plt.title('Number of Candidates by District', fontsize=14)
plt.xticks(rotation=90, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()

# Add the exact amount of candidates for each province
for bar in bars:
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{bar.get_width()}', va='center', ha='left', fontsize=14)

plt.savefig("figures/candidates_by_district.jpg", dpi=300)
No description has been provided for this image
In [15]:
# Group by province to count the number of candidates in each province
province_candidate_counts = (
    merged_df
    .groupby('province_id')
    .agg(total_candidates=('id', 'size'),
         province_name=('province_name', 'first'))
    .reset_index()
)

# Sort by total_candidates in descending order
province_candidate_counts = province_candidate_counts.sort_values(by='total_candidates', ascending=True)
    
# Display the result
province_candidate_counts.tail(20)
Out[15]:
province_id total_candidates province_name
33 45 700 ร้อยเอ็ด
23 35 723 ยโสธร
71 91 737 สตูล
45 57 802 เชียงราย
25 37 802 อำนาจเจริญ
1 11 815 สมุทรปราการ
30 42 859 เลย
73 93 911 พัทลุง
55 70 1007 ราชบุรี
61 76 1073 เพชรบุรี
20 32 1115 สุรินทร์
6 16 1137 ลพบุรี
37 49 1242 มุกดาหาร
70 90 1595 สงขลา
4 14 1614 พระนครศรีอยุธยา
63 80 1699 นครศรีธรรมราช
19 31 1785 บุรีรัมย์
38 50 1902 เชียงใหม่
0 10 2439 กรุงเทพมหานคร
21 33 2674 ศรีสะเกษ
In [16]:
# Plot the number of candidates using a bar chart
plt.figure(figsize=(15, 20))
bars = plt.barh(province_candidate_counts['province_name'].astype(str), province_candidate_counts['total_candidates'], color='skyblue')
plt.ylabel('Province', fontsize=14)
plt.xlabel('Total Number of Candidates', fontsize=14)
plt.title('Number of Candidates by Province', fontsize=14)
plt.xticks(rotation=90, ha='right', fontsize=14)
plt.yticks(fontsize=14)
plt.tight_layout()

# Add the exact amount of candidates for each province
for bar in bars:
    plt.text(bar.get_width(), bar.get_y() + bar.get_height()/2, f'{bar.get_width()}', va='center', ha='left', fontsize=14)
    
#for bar in bars:
    #plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), f'{int(bar.get_height())}', ha='center', va='bottom')

plt.savefig("figures/candidates_by_province.jpg", dpi=300)
No description has been provided for this image
In [17]:
#### Mapping candidate distribution ####

# Import shapefile
map_df = gpd.read_file('tha_adm_rtsd_itos_20210121_shp/tha_admbnda_adm2_rtsd_20220121.shp')

# Prepare data
map_df['province_id'] = map_df['ADM1_PCODE'].str[2:].astype(int)
map_df['district_id'] = map_df['ADM2_PCODE'].str[4:].astype(int)
map_df['รหัสสำนักทะเบียน'] = map_df['ADM2_PCODE'].str[2:].astype(int)

# Merge
gdf = map_df.merge(district_candidate_counts, on = ['province_id', 'district_id'], how = 'left')
In [18]:
# Plot the data
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
gdf.boundary.plot(ax=ax, linewidth=0.1)
gdf.plot(column='total_candidates', ax=ax, legend=True, cmap='rocket_r', linewidth=0.5, edgecolor='0.5')

# Add title and labels
plt.title('Total Number of Candidates by District in Thailand', fontsize=14)
ax.set_axis_off()

# Show plot
plt.savefig("figures/map.jpg", dpi=300)
No description has been provided for this image
In [17]:
# Import population data
pop = pd.read_excel('stat_a66.xlsx')

# Merge
gdf = gdf.merge(pop, on = 'รหัสสำนักทะเบียน',  how = 'left')

# Calculate candidate density
gdf['candidate_density'] = gdf['total_candidates']/gdf['จำนวนประชากรทั้งหมด']

# Log transform
gdf['log_candidate_density'] = gdf['candidate_density'].apply(lambda x: 0 if x == 0 else (np.log(x + 1)))

# Show table
gdf.dropna(subset=['candidate_density'], inplace=True)
table_density = gdf.sort_values(by ='candidate_density', ascending = False)[['district_name', 'province_name', 'total_candidates', 'จำนวนประชากรทั้งหมด', 'candidate_density']].head(20)

# Save
render_mpl_table(table_density, header_columns=0, col_width=5.0).figure.savefig("figures/table_density.jpg", dpi=300)
No description has been provided for this image
In [18]:
# Plot the data
fig, ax = plt.subplots(1, 1, figsize=(20, 20))
gdf.boundary.plot(ax=ax, linewidth=0.1)
gdf.plot(column='log_candidate_density', ax=ax, legend=True, cmap='rocket_r', linewidth=0.5, edgecolor='0.5')

# Add title and labels
plt.title('Log-Transformed Candidate Density by District in Thailand', fontsize=14)
ax.set_axis_off()

# Show plot
plt.savefig("figures/map_density.jpg", dpi=300)
No description has been provided for this image
In [20]:
# Calculate the median candidates per district for each province
per_province = district_candidate_counts.groupby('province_name')['total_candidates'].median().sort_values(ascending=False)

# Create a box plot
plt.figure(figsize=(15, 6))
sns.boxplot(x=district_candidate_counts['province_name'], y=district_candidate_counts['total_candidates'], order=per_province.index, palette="viridis")
plt.title('Candidates Distribution per District by Province')
plt.xlabel('Province')
plt.ylabel('Number of Candidates (District)')
plt.xticks(rotation=90)
plt.tight_layout()

plt.savefig("figures/distribution.jpg", dpi=300)
No description has been provided for this image
In [21]:
# Pivot
heatmap_data = merged_df.pivot_table(index='job_name', columns='province_name', values='id', aggfunc='count', fill_value=0)

# Get job_name-job_group mapping
job_group_mapping = groups.set_index('job_name')
heatmap_data = heatmap_data.join(job_group_mapping).sort_values('job_group').drop(columns='job_group')

# Create the heatmap with a better colormap scale
plt.figure(figsize=(30, 10))
heatmap = sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt='d', linewidths=.5, annot_kws={"size": 12})

# Add a color bar for interpretation
cbar = heatmap.collections[0].colorbar
cbar.set_label('จำนวนผู้สมัคร', fontsize=14)

# Add titles and labels
plt.title(f'การกระจายตัวของผู้สมัครตามกลุ่มและจังหวัดในประเทศไทย', fontsize=16)
plt.xlabel('จังหวัด', fontsize=14)
plt.ylabel('กลุ่ม', fontsize=14)
plt.xticks(rotation=45, ha='right', fontsize=14)
plt.yticks(rotation=0, fontsize=14)

# Adjust layout and save the plot
plt.tight_layout()
plt.savefig(f"figures/heatmap_province.jpg", dpi=300)

# Store province
with open(f'heatmaps/province.txt', 'x') as f:
    f.write(str(heatmap_data.columns.to_list()
))

# Drop index
heatmap_data = heatmap_data.reset_index(drop=True)

# Create data array
data_array = []
for job_index, row in heatmap_data.iterrows():
    for province_index, count in enumerate(row[1:]):  # Skip the first column which is job_name
        data_array.append([province_index, job_index, count])
with open(f'heatmaps/province_data.txt', 'x') as f:
    f.write(str(data_array))
No description has been provided for this image
In [111]:
# Define the function to create and save a heatmap for a given province
def create_province_heatmap(df, province_name, output_dir="figures"):
    # Filter the data for the current province
    province_data = df[df['province_name'] == province_name]

    # Create a pivot table for the heatmap
    heatmap_data = province_data.pivot_table(index='job_name', columns='district_name', values='id', aggfunc='count', fill_value=0)

    # Get job_name-job_group mapping
    job_group_mapping = groups.set_index('job_name')
    heatmap_data = heatmap_data.join(job_group_mapping).sort_values('job_group').drop(columns='job_group')

    # Extract job_name-job_group mapping
    #job_group_mapping = province_data[['job_name', 'job_group']].drop_duplicates().set_index('job_name')
    #heatmap_data = heatmap_data.join(job_group_mapping).sort_values('job_group').drop(columns='job_group')

    # Create the heatmap with a better colormap scale
    plt.figure(figsize=(30, 10))
    heatmap = sns.heatmap(heatmap_data, cmap='YlGnBu', annot=True, fmt='d', linewidths=.5, annot_kws={"size": 12})

    # Add a color bar for interpretation
    cbar = heatmap.collections[0].colorbar
    cbar.set_label('Number of Candidates', fontsize=14)

    # Add titles and labels
    plt.title(f'การกระจายตัวของผู้สมัครตามกลุ่มและอำเภอใน {province_name}', fontsize=16)
    plt.xlabel('อำเภอ', fontsize=14)
    plt.ylabel('กลุ่ม', fontsize=14)
    plt.xticks(rotation=45, ha='right', fontsize=14)
    plt.yticks(rotation=0, fontsize=14)
  
    # Adjust layout and save the plot
    plt.tight_layout()
    plt.savefig(f"{output_dir}/heatmap_{province_name}.jpg", dpi=300)

    # Store province
    with open(f'heatmaps/{province_name}.txt', 'x') as f:
        f.write(str(heatmap_data.columns.to_list()
    ))
        
    # Create Array
    heatmap_data = heatmap_data.reset_index(drop=True)
    data_array = []
    for job_index, row in heatmap_data.iterrows():
        for province_index, count in enumerate(row[1:]):  # Skip the first column which is job_name
            data_array.append([province_index, job_index, count])
    with open(f'heatmaps/{province_name}_data.txt', 'x') as f:
        f.write(str(data_array))
    
# Iterate over each unique province to create individual heatmaps
for province in merged_df['province_name'].unique():
    create_province_heatmap(merged_df, province)
<ipython-input-111-c2699496bfc9>:18: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).
  plt.figure(figsize=(30, 10))
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image