WIPO Webscraping: Sort by Relevance - Little Science, Big Science, and Beyond

WIPO Webscraping: Sort by Relevance

This is an analysis of patent data obtained by web scraping WIPO PATENTSCOPE with the following search settings:
On the main page (https://patentscope.wipo.int/search/en/search.jsf), set “Field” to Publication Date. Then enter the first year of interest in the “Search terms...” box.
In the section directly under the search bar, set “Sort” to Relevance and “Per page” to 200.

1 Importing libraries

import pandas as pd 
import plotly.express as px
import country_converter as coco
import plotly.io as pio
from IPython.display import IFrame
# Make "plotly_mimetype" the default renderer for every figure shown below.
pio.renderers.default = "plotly_mimetype"
# Single converter instance, reused by the country-name lookup further down.
cc = coco.CountryConverter()

2 Importing data

# Load the patent records from CSV (path is relative to the working directory).
# The code below reads the columns "Office Country", "Inventor", "Applicant"
# and "Publication date" from this frame.
pat = pd.read_csv("../data/amateur-science/pat.csv")

3 Data Processing

Adding country names, codes and continents

# Translate each "Office Country" value into a short country name.
# NOTE(review): not_found="NaN" is the *string* "NaN", not a missing value, so
# unmatched offices presumably end up as a country literally called "NaN"
# instead of being treated as missing by pandas -- confirm this is intended.
pat["Full Country"] = cc.pandas_convert(series=pat["Office Country"], to='name_short', not_found = "NaN")

Function to get patent type

# Lower-case substrings that mark an applicant name as an organisation rather
# than a private person. Matching is a plain substring test against the
# lower-cased applicant, so every entry must itself be lower-case; entries
# with a leading/trailing space (' co', 'ag ') only match next to a space.
_ORGANISATION_MARKERS = (
    'research', 'group', 'society', 'foundation', 'inc', 'compania', 'lab',
    'industries', 'societe', 'manufacturing', 'machine', 'co ',
    'corp', 'association', 'university', 'institute', 'company', 'llc',
    'ltd', 'lfp', 'industria', 'industrie', 'firm', '+', 'co.',
    'pharmaceuticals', 'roche', "l'oreal", 'campos', 'technologies', 'inst',
    'pharma', 'electronics', 'volvo', 'corporation',
    'ltda', 'communications', 'ifp', 'technik', 'siemens', 's.a',
    'operations', 'limited', 'gmbh', 'novartis', 'agency',
    'elektronik', 's.p.a', 'uniwersytet', 's.l', 's.r.l', 'a.s', 'urs',
    'ag ', 'universiteit', 'hospital', 'silverphase',
    'sanofi', 'science', 'medicament', 'recherche', 'tech', 'international',
    'networks', 'france', 'nucleix', 'cosmetique',
    'astrazeneca', 'universite', 'les ', 'igt', 'service', 'services',
    'univ', 'products', 'product', 'bank', 'compan',
    'cotton', '& co', '&co', 'comp', 'constructions', 'meca', 'sciences',
    'consulting', ' spa', 'management', 'associates',
    'holdings', 'systems', ' as', ' co', 'electric', 'printing', 'steel',
    ' ind', 'chemicals', ' ag', 'a.g', 'johnson & johnson',
    'gm. b. h', 'informazioni', 'g. m. b. h.', 'anonyme', 'limitada',
    'sociedad', 'solex', 's. a', 'eleuterio', 'societr',
    'commissariat', 'interlight', 's. l', 'electronique',
    'moebius & ruppert', 'g m b h', 'elektro', 'società', 'energía',
    'philips', '&', 's.c.i.', 'société', 'sté', 'g.m.b.h', 'energy', 'a. k',
    'investigación', 'fabrica',
)


def _is_organisation(applicant):
    """Return True if the lower-cased applicant contains any organisation marker."""
    lowered = applicant.lower()
    return any(marker in lowered for marker in _ORGANISATION_MARKERS)


def determine_status(row):
    """Classify one patent record by who filed it.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with the keys
            'Inventor' and 'Applicant'. Either value may be missing
            (None / NaN).

    Returns:
        "Solo Inventor", "Research/Company", or "default" when the record
        cannot be classified. Always a string, never None.
    """
    inventor = row['Inventor']
    applicant = row['Applicant']
    has_inventor = pd.notna(inventor)
    has_applicant = pd.notna(applicant)

    if has_inventor and has_applicant:
        # Non-text values cannot be compared by substring; treat as unknown
        # instead of silently falling through and returning None.
        if not (isinstance(inventor, str) and isinstance(applicant, str)):
            return "default"
        # The applicant's name appearing inside the inventor field means the
        # inventor filed on their own behalf.
        if applicant in inventor:
            return "Solo Inventor"
        if _is_organisation(applicant):
            return "Research/Company"
        return "default"

    if has_applicant:
        # Inventor is missing: decide from the applicant name alone.
        if not isinstance(applicant, str):
            return "default"
        if _is_organisation(applicant):
            return "Research/Company"
        if applicant == 'applicant name missing':
            return 'default'
        return "Solo Inventor"

    if has_inventor:
        # Applicant is missing but an inventor is named.
        # NOTE: a publication-year cutoff (Year < 1920) was once sketched for
        # this branch but is not applied.
        return "Solo Inventor"

    return "default"

Converting to date, applying function to extract patent type, grouping by year and patent type

# Parse the "dd.mm.yyyy" publication-date strings into real timestamps.
publication_dates = pd.to_datetime(pat["Publication date"], format="%d.%m.%Y")
pat["Publication date"] = publication_dates
# Keep only the calendar year for grouping.
pat["Year"] = publication_dates.dt.year
# Label every record with its patent type (row-wise classification).
pat["Type"] = pat.apply(determine_status, axis=1)
# Count records per (year, type, country); the count column is named "Patents".
group_keys = ["Year", "Type", "Full Country"]
pat = pat.groupby(group_keys).size().reset_index(name="Patents")

Getting proportions

# Drop patents whose type could not be determined. Take an explicit copy so
# the column added below lands on an independent frame rather than on a
# filtered slice of `pat` (avoids pandas' SettingWithCopyWarning).
pat_proportions = pat[pat["Type"] != "default"].copy()
# Total classified patents per year, aligned row-by-row with pat_proportions.
total_per_year = pat_proportions.groupby("Year")["Patents"].transform("sum")
# Each row's share of its year's total.
pat_proportions["Proportion"] = pat_proportions["Patents"] / total_per_year
# Collapse the country dimension. Only the numeric columns are summed;
# summing every column would also "sum" the "Full Country" strings, which
# concatenates them in pandas >= 2.0.
pat_proportions = (
    pat_proportions.groupby(["Year", "Type"])[["Patents", "Proportion"]]
    .sum()
    .reset_index()
)

3.1 Data Visualization: Stacked Bars

# Stacked vertical bars: one bar per year, split by patent type, with the
# per-type share of that year's patents as bar height.
fig1 = px.bar(
    pat_proportions,
    x="Year",
    y="Proportion",
    color="Type",
    orientation="v",
    template="plotly_white",
    color_discrete_sequence=["#F7C0BB", "#EB5E51"],
    title="<b>Worldwide Patent Applications Classification</b>",
    height=400,
    width=1100,
    hover_name="Year",
    hover_data=["Patents", "Proportion"],
)

# X axis: no title, a tick every 5 years, fixed 1910-2023 window.
fig1.update_xaxes(
    title=None,
    dtick=5,
    ticks="outside",
    ticklen=4,
    range=[1910, 2023],
)
# Y axis: show shares as whole percentages.
fig1.update_yaxes(title="Relative Proportion", tickformat=",.0%")

fig1.update_layout(
    legend_title=None,
    font_family="Calibri",
    font_color="black",
    title_font_family="Calibri",
    font=dict(size=14),
    title_font_color="black",
)

# Source credit, positioned in paper coordinates below the plot area.
annotations = [
    dict(
        text="Source: WIPO Patentscope",
        showarrow=False,
        x=0.99,
        y=-0.23,
        xref="paper",
        yref="paper",
        font=dict(size=11, color="grey"),
    ),
]
for annotation in annotations:
    fig1.add_annotation(annotation)

fig1.show()

# Embed the externally hosted page at this URL as an 800x700 iframe.
IFrame(src='https://janeabdo.github.io/carousel/', width='800', height='700')