Scrape and analyze data analyst job requirements with Python¶

Coursera Project Network¶

By Cesar Perez¶

11/18/2023

Introduction¶

In this project we'll help a recruitment agency and its candidates for the role of data analyst find their ideal job based on technical skills. To achieve this, we'll follow these steps:

  1. Gather data from indeed.com (web-scraping). As of the current version of this notebook, we'll gather the job id, title, description and URL.
  2. Analyze the job descriptions to find keywords related to technical skills.
  3. Create a custom HTML communication for the candidate. As of the current version of this notebook, we'll focus on creating a basic HTML template, leaving the SMTP email-sending algorithm out of scope for now.

Each step will be handled by a different Python function. Let's prepare our environment and define the functions; then we'll use them to create two customized recommendations for fictional candidates.

In [1]:
import pandas as pd
import re
import os
import json

from selenium import webdriver
from bs4 import BeautifulSoup
In [2]:
pd.set_option("display.max_colwidth", None)
pd.set_option('display.max_columns', None)

Defining Functions.¶

1. Web-Scraping Function¶

If we want to help candidates find their ideal job, we need to start by knowing what offers are out there, so we'll look into one of the most popular job sites today: indeed.com.

To gather relevant information from this site, we need our function to perform the following steps:

  1. Send a GET request. The URL parameters used by this portal that are relevant for our script are 'q' for querying, 'l' for location, 'fromage' for the number of days since a job was posted, and 'start' for the 'page' or indexing of the jobs displayed (15 per page). A sketch of how such a URL can be assembled is shown after this list.
  2. Scan the web response from indeed and locate the job ids. By digging a little into the indeed page structure using Chrome's dev console, I was able to find, inside a 'script' tag, an object, window._initialData, holding a nested JSON, jobKeysWithTwoPaneEligibility, that lists the job ids (shown below).

indeed_job_id.png

  3. Having gathered the job ids, send a second request to get the complete data about each posting. In other words, we first query indeed to get the job ids and then send a second query requesting specific information about each id.
  4. Save the job URL, title and description into a dictionary, using the id as key and the other attributes as a nested dictionary.
  5. Return the dictionary for further use.
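To make the request format from step one concrete, here is a minimal sketch of how such a search URL can be assembled; the parameter values are just an illustration:

from urllib.parse import urlencode

# Illustrative values only: query, location, max posting age and result offset
params = {'q': 'data analyst', 'l': 'New York, NY', 'fromage': 30, 'start': 10}
url = 'https://www.indeed.com/jobs?' + urlencode(params)
print(url)
# https://www.indeed.com/jobs?q=data+analyst&l=New+York%2C+NY&fromage=30&start=10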

Let's define the function. Note that we are giving three of the function arguments default values. Also note that we are using a try-except block for the section handling the second request, meaning that if any error is encountered during execution we'll just print the impacted id for our reference and move on to the next one without interruption. Finally, note that for now we will only focus on indeed.com, meaning that the jobs are based in the United States.

In [3]:
def indeed_jobs(keyword, location='', pages=1, age=30):
    job_dict = dict()
    # Build the query-string pieces described above: q, l, fromage and start
    q_val = 'q=' + '+'.join(keyword.split())
    l_val = '&l=' + location if location != '' else ''
    fromage_val = '&fromage=' + str(age)
    start_val = '&start='
    
    for page in range(1, pages+1):
        # Page through the results in steps of 10: page N starts at (N*10)-10
        url = 'https://www.indeed.com/jobs?' + q_val + l_val + fromage_val + start_val + str((page*10)-10)
        dr = webdriver.Chrome()
        dr.get(url)
        bs = BeautifulSoup(dr.page_source, "html.parser")
        
        # The job ids live in a JSON object embedded in one of the <script> tags
        for block in bs.find_all('script'):
            if '"jobKeysWithTwoPaneEligibility":' in str(block):
                job_ids = json.loads(re.search(r'"jobKeysWithTwoPaneEligibility":({[\w",:]+})', str(block)).group(1))
                for job_id in job_ids.keys():
                    try:
                        # Second request: fetch the full posting for each id
                        job_detail_url = 'https://www.indeed.com/viewjob?jk=' + job_id
                        dr.get(job_detail_url)
                        bs_jd = BeautifulSoup(dr.page_source, "html.parser")
                        job_title = bs_jd.find('h1').find('span').text
                        job_description = bs_jd.find('div', attrs={'id': 'jobDescriptionText'}).text.strip()
                        job_dict[job_id] = {'url': job_detail_url, 'title': job_title, 'description': job_description}
                    except Exception:
                        print(f'Job: {job_id} failed - Ignoring')
                        continue
        dr.close()
    return job_dict

2. Job-Scanning Function¶

Having gathered raw data about a series of job postings, we want a practical way to summarize it so we can recommend postings to our candidates without having to read each whole post and decide which of our candidates could be a good fit; in other words, we want to make automated recommendations. To achieve this, we need a function that performs the following steps:

  1. Receive raw data about the job postings (the output of our first function).
  2. Search each job description for a series of keywords that help us identify the requirements the hiring team is looking for in candidates. By doing this, we transform the job description from a long text into a series of boolean values that we can use as criteria to match fitting candidates.
  3. Concatenate each summarized job posting into a single dataframe.
  4. Return the dataframe for further use.

To make bullet number two a viable approach, we need to identify the common skills for the role and then the words (strings) used to describe them. For example, "databases" is a skill, and words like "SQL", "PSQL", "sequel", "Postgres", etc., are directly related to it, so a job posting saying "1+ years experience in SQL Server is preferred" will be categorized as requiring the databases skill because it contains a word associated with that skill. Our function will therefore require a second input: a dictionary having skills as keys and lists of associated words as values.
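As a minimal illustration of this matching idea, using a made-up description and a trimmed-down skill dictionary:

# Made-up posting text and a reduced skill dictionary, for illustration only
description = "1+ years experience in SQL Server is preferred"
skills = {'Databases': ['SQL', 'Postgres'], 'Python': ['Python', 'Jupyter']}

matched = {skill: any(kw.lower() in description.lower() for kw in kws)
           for skill, kws in skills.items()}
print(matched)  # {'Databases': True, 'Python': False}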

Let's define the function.

In [4]:
def scan_jobs(job_dict, skill_dict):
    # Start with an empty frame: Title, URL, plus one boolean column per skill
    job_df = pd.DataFrame({'Title':[], 'URL':[]})
    for skill in skill_dict.keys():
        job_df.insert(len(job_df.columns), skill, [], False)
        
    for job in job_dict.keys():
        title = job_dict[job]['title']
        url = job_dict[job]['url']
        job_df_row = {'Title':[title], 'URL':[url]}
        
        # Flag a skill as soon as any of its keywords appears in the description
        for skill in skill_dict.keys():
            job_df_row[skill] = False
            for keyword in skill_dict[skill]:
                if keyword.lower() in job_dict[job]['description'].lower():
                    job_df_row[skill] = True
                    break
                
        job_df = pd.concat([job_df, pd.DataFrame(job_df_row)], ignore_index=True)
    # Drop postings captured more than once across result pages
    job_df.drop_duplicates(inplace=True)
    return job_df
        

3. Candidate Communication Function¶

Now it's time to put our summarized dataframe to work by sharing fitting job possibilities with our candidates. We will achieve this goal by defining a function that will:

  1. Receive a candidate "profile", a dictionary holding information about their current skills, and use it to filter the summarized dataframe we created in the previous step so we get the best options for them. For now, the approach we'll take to provide recommendations is simply to rate each job by the number of the candidate's skills it includes; for example, if the candidate has the Python, SQL and Linux skills, a job requiring all three rates 3 points, one requiring only Linux and Python rates 2 points, and so on (see the toy example after this list).
  2. Transform the dataframe from our previous step into a new format showing the percentage of job postings that mention each skill, so the candidate knows which skills are the most demanded as per the last analysis.
  3. Create a customized HTML communication.
  4. Return the HTML.
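As a toy example of the rating idea in bullet one, with hypothetical jobs and profile rather than the real dataframe:

# One point per candidate skill that the posting also requires (rate is a
# made-up helper for illustration)
def rate(job, profile):
    return sum(1 for skill, has in profile.items() if has and job.get(skill, False))

candidate = {'Python': True, 'Databases': True, 'Linux': True, 'Tableau': False}
job_a = {'Python': True, 'Databases': True, 'Linux': True, 'Tableau': True}
job_b = {'Python': True, 'Databases': False, 'Linux': True, 'Tableau': False}
print(rate(job_a, candidate), rate(job_b, candidate))  # 3 2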

Let's define the function.

In [5]:
def html_communication(candidate_name, candidate_profile, job_df, skill_dict, position):
    # Top 10 most demanded skills across all scanned postings
    summary = pd.melt(job_df, value_vars=skill_dict.keys(), var_name='Skills', value_name='Count').groupby('Skills').sum().sort_values('Count', ascending=False).head(10)
    summary['%'] = round((summary['Count'] / job_df.shape[0]) * 100, 2)
    
    # Keep only the Title, the URL and the skills the candidate actually has
    candidate_skills = ['Title', 'URL']
    for skill in candidate_profile:
        if candidate_profile[skill]:
            candidate_skills.append(skill)
    candidate_recomendation = job_df[candidate_skills].copy()
    # Rate each job by how many of the candidate's skills it requires; keep the top 5
    candidate_recomendation['total'] = candidate_recomendation.iloc[:, 2:].sum(axis=1)
    candidate_recomendation = candidate_recomendation.sort_values('total', ascending=False).head(5)
    
    Job_table = '<table><tr><th>Job Title</th><th>Link</th></tr>'
    for indx, row in candidate_recomendation[['Title', 'URL']].iterrows():
        title = row['Title']
        url = row['URL']
        table_row = f'<tr><td>{title}</td><td><a href="{str(url)}" target="_blank">Apply here!</a></td></tr>\n'
        Job_table += table_row
    Job_table += '</table>'
    
    top_skill_table = '<table><tr><th>Skill</th><th>Percentage</th></tr>'
    for indx, row in summary.iterrows():
        skill = indx
        percentage = row['%']
        table_row = f'<tr><td>{skill}</td><td>{percentage}%</td></tr>\n'
        top_skill_table += table_row
    top_skill_table += '</table>'
    
    style_tag = '''<style>
                div{
                    margin: auto;width: 90%;
                    padding: 10px;
                    font-family: Arial;
                }
                table {
                  font-family: arial, sans-serif;
                  border-collapse: collapse;
                  width: 100%;
                }
                
                td, th {
                  border: 1px solid #dddddd;
                  text-align: left;
                  padding: 8px;
                }
                
                tr:nth-child(even) {
                  background-color: #dddddd;
                }
                @media (min-width: 650px) {
                    div{
                        margin: auto;width: 50%;
                    }
                }
                </style>'''
    
    output_html = f'''<!doctype html>
            <html lang="en">
            <head>
                <title>Your Daily Job Recommendation</title>
                <meta name="viewport" content="width=device-width,initial-scale=1">
            </head>
            {style_tag}
            <body>
                <div>
                    <h1>Hello {candidate_name}!</h1>
                    <h2>Are you ready to take the next step?</h2>
                    <p>We've found positions that fit your profile:</p>
                    {Job_table}
                    <h2>Did you know?</h2>
                    <p>As per our last analysis, these are the top 10 skills demanded for {position}:</p>
                    {top_skill_table}
                </div>    
            </body>
            </html>'''
    return output_html

Creating Job Recommendations¶

It's time to test our solution. First, we need to gather raw data from indeed. Let's define the parameters we'll use to get data about data analyst jobs: we'll focus on posts up to 30 days old, pull data from 150 indeed pages, and for now we won't worry about location.

In [6]:
position = 'Data Analyst'
pages = 150
age = 30

Let's run the first function, saving the outcome in a variable called "job_dict".

In [7]:
job_dict = indeed_jobs(position, pages = pages, age = age)
Job: 41b53f4529a4b14c failed - Ignoring
Job: 56f150b103d4e40c failed - Ignoring
Job: 2c317bb86965da31 failed - Ignoring

Let's print the first captured job as an example.

In [8]:
job_dict[list(job_dict.keys())[0]]
Out[8]:
{'url': 'https://www.indeed.com/viewjob?jk=f69950156bbdac29',
 'title': 'Data Analyst',
 'description': "At Alpine IQ, we're passionate about delivering innovative and effective software solutions to our clients. Our commitment to data-driven decision-making drives our success, and we're looking for a Data Analyst to join our team. This is an exciting opportunity for individuals looking to launch their career in the dynamic world of Software as a Service (SaaS) and make a significant impact in a fast-growing company.\n\n At Alpine IQ, the Data Analyst is a pivotal role, tasked with unlocking the power of data to drive insights for our Sales and Customer Success teams. With a keen eye for detail and a passion for numbers, you will be responsible for designing and maintaining dashboards that provide actionable insights into our sales processes, customer journeys, and success metrics. If you're driven by the challenge of converting complex data into compelling business narratives, we would love to hear from you.\n General Duties & Responsibilities\n Develop, Implement and Maintain a Comprehensive Ticketing System:\n\nArchitect a ticketing system within our productivity management platform to streamline, prioritize and manage incoming data requests.\nIntegrate a robust data definition framework and incorporate discovery questions, ensuring precise and meaningful report generation for each request.\n\nDashboard Design & Maintenance:\n\nDesign, develop, and update intuitive dashboards for the Sales and Customer Success teams using tools like Tableau, Power BI, or Looker.\nEnsure dashboards provide real-time, accurate, and relevant insights.\n\nData Management:\n\nExtract, clean, and analyze data from various sources to ensure its integrity and relevance.\nCollaborate with the engineering team for seamless data integration.\n\nCollaboration with Sales Customer Success Teams and Marketing teams:\n\nEngage regularly with the Sales Customer Success and Marketing teams to comprehend their data requirements and challenges.\nOffer strategic insights to optimize sales performance and elevate customer success outcomes.\n\nContinuous Learning and Optimization:\n\nStay abreast with emerging trends in data analysis and visualization.\nAdvocate for enhancements in data collection and interpretation methodologies.\n\nReporting and Insights Communication:\n\nCraft regular reports for different stakeholders, from team leads to executive members.\nPresent data-backed recommendations to Alpine IQ's leadership team.\n\nSkills & Competencies\n\nMastery of data visualization tools like Tableau, Power BI, or Looker.\nStrong analytical and problem-solving skills.\nAbility to translate data into actionable business insights.\nExceptional communication skills, both written and verbal.\nProficient in SQL and understanding of data warehouse structures.\nTeam player with the ability to function in a cross-functional environment.\n\nPreferred Experience & Education\n\nBachelor's degree in Data Science, Statistics, Business, or a related field.\n2-4 years of experience in a data analyst role, preferably within a SaaS company.\nProven experience in designing and maintaining business dashboards.\nFamiliarity with the SaaS industry and its key metrics would be a plus.\nExperience in Hubspot CRM"}

Now let's create the dictionary of skills we consider relevant for the role of data analyst and that our candidates may or may not have. This dictionary is an object we can always revisit, because it will keep changing as new technologies become popular or obsolete. Also consider that, since our function works with substrings, a single keyword may match multiple words in a job description; for example, the substring "Statistic" will match "Statistics", "Statistical", and so on. For the same reason, we need to be careful with keywords that can be taken out of context: a word like "Excel", the Microsoft tool, would be found inside the word "excellent", which we expect to see in many postings ("excellent communication skills are required..."). That is why the definition below includes commas, extra characters, or spaces that limit those mistakes as much as possible; for example, we use "MS Access" to avoid matching the expression "to access".
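A quick demonstration of the false-positive problem and how a padded keyword avoids it:

text = "Excellent communication skills are required"
print('excel' in text.lower())   # True  - a bare "Excel" wrongly matches "excellent"
print('excel ' in text.lower())  # False - the trailing space in 'Excel ' avoids the false match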

In [9]:
skill_dict = {'Databases': ['SQL', 'Transact-SQL', 'Sequel', 'Database', 'MySQL', 'Postgres', 'PostgreSQL', 'DB2','Oracle DB','OracleDB', 'MS Access', 'MSAccess', 'Microsoft Access'],
              'Non-Relational Databases':['MongoDB'],
              'Python': ['Python', 'Jupyter'], 
              'R (Programming Language)': [' R ', ', R,', ' R,',  ' R.',',R,'],
              'JavaScript': ['JavaScript'],
              'Java':['Java ', 'Java,', 'Java.'], 
              'PHP':['PHP'],
              'HTML':['HTML'],
              'Spreadsheets':['Excel ', 'Excel,','Excel.', 'spreadsheet', 'Google Sheets'], 
              'VBA':['VBA'], 
              'SAS':['SAS'],
              'SAP':[' SAP ', ', SAP', ',SAP,', ' SAP.'],
              'SPSS':['SPSS'],
              'Tableau':['Tableau'], 
              'Power BI':['Power BI', 'PowerBI'], 
              'Qlik': ['Qlik'],
              'Power Point':['Power Point', 'PowerPoint'],
              'ETL':['ETL', 'ELT'], 
              'Databricks':['Data bricks', 'Databricks'],
              'Snowflake':['Snowflake'],
              'Google Analytics': ['Google Analytics'],
              'Statisticts/Mathematics':['Statistic','Math'],
              'Linux':['Linux', 'Ubuntu', 'Debian', 'CentOS', 'Red Hat', 'RedHat','RHEL'],
              'Version Control Systems': [' Git ',' Git,', ',Git,',' Git.', 'GitHub', 'GitLab']}

Let's run the second function, saving the outcome in a variable called "result".

In [10]:
result = scan_jobs(job_dict, skill_dict)
C:\Users\perez\AppData\Local\Temp\ipykernel_17384\4000243932.py:17: FutureWarning: Behavior when concatenating bool-dtype and numeric-dtype arrays is deprecated; in a future version these will cast to object dtype (instead of coercing bools to numeric values). To retain the old behavior, explicitly cast bool-dtype arrays to numeric dtype.
  job_df = pd.concat([job_df, pd.DataFrame(job_df_row)], ignore_index = True)

Let's print the first couple of lines of the resulting output. As a refresher, the 1s and 0s indicate whether at least one word related to the skill was identified, regardless of how many there were; these values represent a true/false outcome and are not to be read as counts.

In [11]:
result.head()
Out[11]:
Title URL Databases Non-Relational Databases Python R (Programming Language) JavaScript Java PHP HTML Spreadsheets VBA SAS SAP SPSS Tableau Power BI Qlik Power Point ETL Databricks Snowflake Google Analytics Statisticts/Mathematics Linux Version Control Systems
0 Data Analyst https://www.indeed.com/viewjob?jk=f69950156bbdac29 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
1 Data Analyst https://www.indeed.com/viewjob?jk=98d14a169483ded8 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 Data Analyst https://www.indeed.com/viewjob?jk=2b79c482e030df68 1.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0
3 Data Analyst I https://www.indeed.com/viewjob?jk=138bd3c14de00e5f 1.0 0.0 1.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 Data Analyst https://www.indeed.com/viewjob?jk=0ab722268e5631e5 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0

Given the algorithm, we do not expect to see any null values; let's confirm that.

In [12]:
result.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 916 entries, 0 to 915
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Title                     916 non-null    object 
 1   URL                       916 non-null    object 
 2   Databases                 916 non-null    float64
 3   Non-Relational Databases  916 non-null    float64
 4   Python                    916 non-null    float64
 5   R (Programming Language)  916 non-null    float64
 6   JavaScript                916 non-null    float64
 7   Java                      916 non-null    float64
 8   PHP                       916 non-null    float64
 9   HTML                      916 non-null    float64
 10  Spreadsheets              916 non-null    float64
 11  VBA                       916 non-null    float64
 12  SAS                       916 non-null    float64
 13  SAP                       916 non-null    float64
 14  SPSS                      916 non-null    float64
 15  Tableau                   916 non-null    float64
 16  Power BI                  916 non-null    float64
 17  Qlik                      916 non-null    float64
 18  Power Point               916 non-null    float64
 19  ETL                       916 non-null    float64
 20  Databricks                916 non-null    float64
 21  Snowflake                 916 non-null    float64
 22  Google Analytics          916 non-null    float64
 23  Statisticts/Mathematics   916 non-null    float64
 24  Linux                     916 non-null    float64
 25  Version Control Systems   916 non-null    float64
dtypes: float64(24), object(2)
memory usage: 193.2+ KB
In [13]:
result.to_csv('results.csv',  index=False)

Find the complete results table here

For our own knowledge, let's look at the observed frequencies per skill; remember that we are providing this information (as a top 10) to our candidates as well.

In [14]:
summary = pd.melt(result, value_vars=skill_dict.keys(), var_name = 'Skills', value_name='Count').groupby('Skills').sum().sort_values('Count', ascending=False)
summary['Percentage'] = round((summary['Count'] / result.shape[0]) * 100,2)
summary
Out[14]:
Count Percentage
Skills
Databases 596.0 65.07
Statisticts/Mathematics 399.0 43.56
Spreadsheets 374.0 40.83
Tableau 252.0 27.51
Python 232.0 25.33
Power BI 217.0 23.69
SAS 131.0 14.30
R (Programming Language) 122.0 13.32
Power Point 111.0 12.12
ETL 78.0 8.52
SAP 40.0 4.37
Snowflake 39.0 4.26
HTML 30.0 3.28
SPSS 27.0 2.95
VBA 26.0 2.84
Qlik 25.0 2.73
Java 21.0 2.29
JavaScript 18.0 1.97
Databricks 16.0 1.75
Version Control Systems 14.0 1.53
PHP 12.0 1.31
Linux 12.0 1.31
Google Analytics 10.0 1.09
Non-Relational Databases 3.0 0.33
In [15]:
summary.rename(index={'R (Programming Language)': 'R', 'Statisticts/Mathematics':'Stats/Maths', 'Non-Relational Databases':'NR-Databases', 'Version Control Systems':'VCS', 'Google Analytics': 'G. Analytics'}).plot.bar(y='Percentage',title = f'Most demanded skills for the role of: {position}', rot=35, fontsize = 8, figsize = (10, 5)).grid(axis='y')

As new jobs are posted, our analysis is expected to show some variation each time it is executed; however, in almost all cases Databases and Statistics will come out on top. There is no question: every data professional should have a strong foundation working with tabular data and should feel comfortable using that data to perform calculations in a SQL management system or in spreadsheets. Popular tools such as Tableau and Power BI are a nice complement to our toolbox, and finally, being able to automate tasks with a flexible and powerful programming language like Python will make anyone more attractive to most companies.

Now, let's use our third and last function to turn our jobs into recommendations for the candidates. Given that we defined the set of skills as a dictionary, we should define a candidate's profile in the same way, replacing each list of associated words with a single True or False value, where True means the candidate has the specific skill or feels comfortable with it.

Let's create two fictional profiles:

Laura: she has experience working with Excel, including VBA to automate some tasks, and SAS; she also has strong knowledge of Tableau and Power BI for creating dashboards and visualizations.

In [16]:
Laura_profile = {'Databases': False,
              'Non-Relational Databases':False,
              'Python':False, 
              'R (Programming Language)': False,
              'JavaScript':False,
              'Java':False, 
              'PHP':False,
              'HTML':False,
              'Spreadsheets':True, 
              'VBA':True, 
              'SAS':True,
              'SAP':False,
              'SPSS':False,
              'Tableau':True, 
              'Power BI':True, 
              'Qlik': False,
              'Power Point':False,
              'ETL':False, 
              'Databricks':False,
              'Snowflake':False,
              'Google Analytics':False,
              'Statisticts/Mathematics':False,
              'Linux':False,
              'Version Control Systems': False}

Robert: he has experience working with SQL, programming languages such as R and Python, GitHub as a version control system, and Linux distributions such as Ubuntu; he also has some experience with spreadsheets and Power BI.

In [17]:
Robert_profile = {'Databases': True,
              'Non-Relational Databases':False,
              'Python':True, 
              'R (Programming Language)': True,
              'JavaScript':False,
              'Java':False, 
              'PHP':False,
              'HTML':False,
              'Spreadsheets':True, 
              'VBA':False, 
              'SAS':False,
              'SAP':False,
              'SPSS':False,
              'Tableau':False, 
              'Power BI':True, 
              'Qlik': False,
              'Power Point':False,
              'ETL':False, 
              'Databricks':False,
              'Snowflake':False,
              'Google Analytics':False,
              'Statisticts/Mathematics':False,
              'Linux':True,
              'Version Control Systems': True}

Finally, let's create the custom HTML communication for our two candidates and save the output into a file.

In [18]:
Laura_email = html_communication('Laura', Laura_profile, result, skill_dict, position)
Robert_email = html_communication('Robert', Robert_profile, result, skill_dict, position)

Let's print Laura's communication as an example.

In [19]:
print(Laura_email)
<!doctype html>
            <html lang="en">
            <head>
                <title>Your Daily Job Recommendation</title>
                <meta name="viewport" content="width=device-width,initial-scale=1">
            </head>
            <style>
                div{
                    margin: auto;width: 90%;
                    padding: 10px;
                    font-family: Arial;
                }
                table {
                  font-family: arial, sans-serif;
                  border-collapse: collapse;
                  width: 100%;
                }
                
                td, th {
                  border: 1px solid #dddddd;
                  text-align: left;
                  padding: 8px;
                }
                
                tr:nth-child(even) {
                  background-color: #dddddd;
                }
                @media (min-width: 650px) {
                    div{
                        margin: auto;width: 50%;
                    }
                }
                </style>
            <body>
                <div>
                    <h1>Hello Laura!</h1>
                    <h2>Are you ready to take the next step?</h2>
                    <p>We've found positions that fit your profile:</p>
                    <table><tr><th>Job Title</th><th>Link</th></tr><tr><td>Staff Data Analyst, Credit Analytics</td><td><a href="https://www.indeed.com/viewjob?jk=918ad336af7e09ae" target="_blank">Apply here!</a></td></tr>
<tr><td>Data Analyst Mid to Senior</td><td><a href="https://www.indeed.com/viewjob?jk=b4bf6ec3e4a1a893" target="_blank">Apply here!</a></td></tr>
<tr><td>Data Analyst (full-time)</td><td><a href="https://www.indeed.com/viewjob?jk=8ed44fc2b94856ce" target="_blank">Apply here!</a></td></tr>
<tr><td>Data Analyst</td><td><a href="https://www.indeed.com/viewjob?jk=14f020e039a577ed" target="_blank">Apply here!</a></td></tr>
<tr><td>Data Analyst - Defense</td><td><a href="https://www.indeed.com/viewjob?jk=094f3784e07bdd98" target="_blank">Apply here!</a></td></tr>
</table>
                    <h2>Did you know?</h2>
                    <p>As per our last analysis, these are the top 10 skills demanded for Data Analyst:</p>
                    <table><tr><th>Skill</th><th>Percentage</th></tr><tr><td>Databases</td><td>65.07%</td></tr>
<tr><td>Statisticts/Mathematics</td><td>43.56%</td></tr>
<tr><td>Spreadsheets</td><td>40.83%</td></tr>
<tr><td>Tableau</td><td>27.51%</td></tr>
<tr><td>Python</td><td>25.33%</td></tr>
<tr><td>Power BI</td><td>23.69%</td></tr>
<tr><td>SAS</td><td>14.3%</td></tr>
<tr><td>R (Programming Language)</td><td>13.32%</td></tr>
<tr><td>Power Point</td><td>12.12%</td></tr>
<tr><td>ETL</td><td>8.52%</td></tr>
</table>
                </div>    
            </body>
            </html>

Finally, let's save the files; links to both are included below.

In [20]:
f = open("Laura.html", "w")
f.write(Laura_email)
f.close()

f = open("Robert.html", "w")
f.write(Robert_email)
f.close()

Find Laura's communication here and Robert's here

What's next?¶

We had a lot of fun with this exercise, right? However, there are clear limitations and areas where we can improve beyond this demo. Here are just a few:

  1. The word-matching approach will never be 100% accurate. More complex algorithms can return better results, but at the expense of being harder to maintain and update.
  2. Capture more job attributes, like rating, salary, age, etc.; this would help us do a better job selecting fitting positions for the candidates.
  3. The skill definition we used here should also be revised; I proposed it based on my own knowledge, but it would be better if more professionals complemented it too.
  4. Include these steps inside an orchestrator for automatic execution.
  5. Enhance the HTML communication to make it more aesthetically pleasing.
  6. Use multiple threads to improve performance (see the sketch after this list).
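For instance, bullet number six could look something like the rough sketch below, where fetch_job is a hypothetical helper standing in for the per-id request logic currently inside indeed_jobs:

from concurrent.futures import ThreadPoolExecutor

def fetch_job(job_id):
    # Placeholder: the real version would load and parse
    # https://www.indeed.com/viewjob?jk=<job_id>, as indeed_jobs does sequentially
    return job_id, {'url': 'https://www.indeed.com/viewjob?jk=' + job_id}

job_ids = ['f69950156bbdac29', '98d14a169483ded8']  # ids taken from the run above
with ThreadPoolExecutor(max_workers=4) as pool:
    job_dict = dict(pool.map(fetch_job, job_ids))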

I would definitely consider the bullets above, and more, in a real implementation, but for now I'm happy with the results and the patterns observed, which I think do reflect the reality and the skills needed from people working as data analysts.

Thanks for reading!!

Find more about my work on my public portfolio

In [ ]: