Little Science, Big Science, and Beyond
How Amateurs Shape the Scientific Landscape
WIPO Webscraping: Sort by Relevance
This is an analysis of patent data obtained by web scraping WIPO PATENTSCOPE, using the following search settings:
In the main page (https://
In the section right under the search bar, set “Sort” to Relevance and “Per page” to 200.
1. Importing libraries
import pandas as pd
import plotly.express as px
import country_converter as coco
import plotly.io as pio
from IPython.display import IFrame
# Select the "plotly_mimetype" renderer as the default for every figure shown.
pio.renderers.default = "plotly_mimetype"
# One shared converter instance, reused for the country lookups below.
cc = coco.CountryConverter()

# 2. Importing data
pat = pd.read_csv("../data/amateur-science/pat.csv")

# 3. Data Processing
# Adding country names, codes and continents
# NOTE(review): not_found="NaN" is passed as a *string*; presumably unmatched
# offices end up labelled with the literal text "NaN" rather than a missing
# value -- confirm this is intended before relying on isna() downstream.
pat["Full Country"] = cc.pandas_convert(series=pat["Office Country"], to='name_short', not_found = "NaN")

# Function to get patent type
# Substrings that mark an applicant name as an organisation rather than a
# private person.  They are matched against the lower-cased applicant name, so
# every entry here must itself be lower case.
_APPLICANT_KEYWORDS = frozenset({
    'research', 'group', 'society', 'foundation', 'inc', 'compania', 'lab',
    'industries', 'societe', 'manufacturing', 'machine', 'co ',
    'corp', 'association', 'university', 'institute', 'company', 'llc', 'ltd',
    'lfp', 'industria', 'industrie', 'firm', '+', 'co.',
    'pharmaceuticals', 'roche', "l'oreal", 'campos', 'technologies', 'inst',
    'pharma', 'electronics', 'volvo', 'corporation',
    'ltda', 'communications', 'ifp', 'technik', 'siemens', 's.a', 'operations',
    'limited', 'gmbh', 'novartis', 'agency',
    'elektronik', 's.p.a', 'uniwersytet', 's.l', 's.r.l', 'a.s', 'urs', 'ag ',
    'universiteit', 'hospital', 'silverphase',
    'sanofi', 'science', 'medicament', 'recherche', 'tech', 'international',
    'networks', 'france', 'nucleix', 'cosmetique',
    'astrazeneca', 'universite', 'les ', 'igt', 'service', 'services', 'univ',
    'products', 'product', 'bank', 'compan',
    'cotton', '& co', '&co', 'comp', 'constructions', 'meca', 'sciences',
    'consulting', ' spa', 'management', 'associates',
    'holdings', 'systems', ' as', ' co', 'electric', 'printing', 'steel',
    ' ind', 'chemicals', ' ag', 'a.g', "johnson & johnson",
    'gm. b. h', 'informazioni', 'g. m. b. h.', 'anonyme', 'limitada',
    'sociedad', 'solex', 's. a', 'eleuterio', 'societr',
    'commissariat', 'interlight', 's. l', 'electronique', 'moebius & ruppert',
    'g m b h', 'elektro', 'società', 'energía',
    'philips', '&', 's.c.i.', 'société', 'sté', 'g.m.b.h', 'energy', 'a. k',
    'investigación', 'fabrica',
})


def _is_organisation(applicant):
    """Return True if the lower-cased applicant contains an organisation keyword."""
    lowered = applicant.lower()
    return any(keyword in lowered for keyword in _APPLICANT_KEYWORDS)


def determine_status(row):
    """Classify one patent record by who filed it.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the
            'Inventor' and 'Applicant' fields.

    Returns:
        "Solo Inventor", "Research/Company" or "default" (unclassifiable).
        Always a string: records whose present fields are not text are
        reported as "default" instead of yielding None or raising.
    """
    inventor = row['Inventor']
    applicant = row['Applicant']
    has_inventor = pd.notna(inventor)

    if pd.isna(applicant):
        # No applicant on record: a named inventor alone counts as a solo filing.
        return "Solo Inventor" if has_inventor else "default"

    if not isinstance(applicant, str):
        # Keyword matching needs text; anything else cannot be classified.
        return "default"

    if has_inventor:
        if not isinstance(inventor, str):
            return "default"
        # The applicant's name appearing inside the inventor field means the
        # inventor filed for themselves.
        if applicant in inventor:
            return "Solo Inventor"
        return "Research/Company" if _is_organisation(applicant) else "default"

    # Inventor missing: decide from the applicant name alone.
    if _is_organisation(applicant):
        return "Research/Company"
    if applicant == 'applicant name missing':
        # Placeholder text in the data, not a real applicant.
        return 'default'
    return "Solo Inventor"


# Converting to date, applying function to extract patent type, grouping by year and patent type
# Parse the "dd.mm.yyyy" publication-date strings into real datetimes.
pat["Publication date"] = pd.to_datetime(pat["Publication date"], format="%d.%m.%Y")
# Keep the calendar year for the yearly aggregation.
pat["Year"] = pat["Publication date"].dt.year
# Label every record with the patent type returned by determine_status.
pat['Type'] = pat.apply(determine_status, axis=1)
# Count records per (year, type, country); the count lands in a "Patents" column.
pat = pat.groupby(['Year', 'Type', 'Full Country']).size().reset_index(name='Patents')

# Getting proportions
# Drop the patents that could not be classified.  Take an explicit copy so the
# column assignment below writes to its own frame rather than a slice of `pat`.
pat_proportions = pat[pat["Type"] != "default"].copy()
# Total classified patents in each row's year, aligned row-by-row.
total_per_year = pat_proportions.groupby('Year')['Patents'].transform('sum')
# Share of that year's patents represented by each (year, type, country) row.
pat_proportions['Proportion'] = pat_proportions['Patents'] / total_per_year
# Collapse the country dimension.  Only the numeric columns are summed; the
# text column "Full Country" has no meaningful sum and is left out.
pat_proportions = (
    pat_proportions
    .groupby(["Year", "Type"])[["Patents", "Proportion"]]
    .sum()
    .reset_index()
)

# 3.1 Data Visualization: Stacked Bars
# Stacked bars: one bar per year, split by patent type, height = yearly share.
fig1 = px.bar(
    pat_proportions,
    x="Year",
    y="Proportion",
    color="Type",
    template="plotly_white",
    color_discrete_sequence=['#F7C0BB', '#EB5E51'],
    title="<b>Worldwide Patent Applications Classification</b>",
    height=400,
    width=1100,
    hover_name="Year",
    hover_data=["Patents", "Proportion"],
    orientation='v',
)
# X axis: a tick every 5 years, window fixed to 1910-2023.
fig1.update_xaxes(title=None, dtick=5, ticks="outside", ticklen=4, range=[1910, 2023])
# Y axis: show the shares as whole percentages.
fig1.update_yaxes(title="Relative Proportion", tickformat=',.0%')
fig1.update_layout(
    legend_title=None,
    font_family="Calibri",
    font_color="black",
    title_font_family="Calibri",
    font=dict(size=14),
    title_font_color="black",
)
# Source credit, positioned in paper coordinates (y is negative, i.e. below
# the plotting area).
annotations = [
    {
        'text': "Source: WIPO Patentscope",
        'showarrow': False,
        'x': 0.99,
        'y': -0.23,
        'xref': 'paper',
        'yref': 'paper',
        'font': {'size': 11, 'color': "grey"},
    },
]
for annotation in annotations:
    fig1.add_annotation(annotation)
fig1.show()

# Embed the externally hosted carousel page.
IFrame(src='https://janeabdo.github.io/carousel/', width='800', height='700')