%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from interface.radar_factory import build_single_radar, build_multiple_radar, build_single_radar_free
# Load the interface review: a TSV with a two-level row index
# (feature family, feature) and one column per newspaper interface.
base = pd.read_csv("../data/InterfaceReview-June2019.tsv", sep="\t", index_col= [0,1], skipinitialspace=True)
# Notebook cell output: inspect the MultiIndex.
base.index
# All features of the 'newspaper metadata' family.
base.loc[('newspaper metadata')]
# Summary statistics for one specific feature.
# NOTE(review): the family is spelled 'information on digitization' here but
# 'info on digitization' in the reindex/per-family cells below — confirm which
# spelling the TSV actually uses.
base.loc['information on digitization','OCR confidence scores'].describe()
base.loc['newspaper metadata','Place of publication']
# about multilingual collections
base.loc['newspaper collection', 'Languages of the collections'].value_counts()
# counts of access models (the original comment was a copy-paste leftover)
base.loc['interface', 'Access model'].value_counts()
# counts of interface providers (likewise)
base.loc['interface', 'Interface provider'].value_counts()
def trim_all_columns(df):
    """
    Trim whitespace from both ends of every string value in the dataframe.

    Non-string cells (numbers, NaN, ...) are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new dataframe with every string cell stripped.
    """
    # isinstance is the idiomatic type check (and covers str subclasses),
    # unlike the original `type(x) is str`.
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    # NOTE(review): applymap is deprecated in pandas >= 2.1 in favour of
    # DataFrame.map; kept here for compatibility with older pandas.
    return df.applymap(trim_strings)
# Normalize every string cell by stripping surrounding whitespace.
base = trim_all_columns(base)
# Remove the rows that are not binary (yes/no) features, so the remaining
# values can be recoded to 0/1 and summed.
base = base.drop('interface', level=0)
base = base.drop('Languages of the collections', level=1) # from 'newspaper collection'
base = base.drop('Number of pages', level=1) # from 'newspaper collection'
base = base.drop('Number of articles', level=1) # from 'newspaper collection'
base = base.drop('Number of issues', level=1) # from 'newspaper collection'
base = base.drop('Number of newspaper titles', level=1) # from 'newspaper collection'
base = base.drop('Newspaper date range', level=1) # from 'newspaper collection'
base = base.drop('Other', level=1)
base = base.drop('Download options (file formats)', level=1)
# Recode the yes/no spellings to 1/0; unknowns ('?', 'u') count as 0.
base = base.replace(to_replace=['y', 'y?', 'y (annotations)', 'y (requires user account - free)', 'n', '?', 'u', 'n?'],
value=[1,1,1,1,0,0,0,0])
base.head()
# Sum values of level 1 => 'grade' of each interface per family of features
# NOTE(review): the `axis` argument to GroupBy.sum is deprecated/ignored in
# recent pandas; the intent here is simply a per-group (per-family) sum.
level_0 = base.groupby(level=0).sum(axis=1)
# re-order index into the curated presentation order
# NOTE(review): each name must match a level-0 label exactly (e.g.
# 'info on digitization') or reindex silently inserts an all-NaN row — verify
# against the TSV spellings.
level_0 = level_0.reindex(["newspaper metadata",
"apis",
"connectivity",
"info on digitization",
"enrichment",
"user interaction",
"viewer",
"result display",
"result filtering",
"result sorting",
"search",
"browsing"
])
level_0.head()
#Total sum per row, ie. per feature family (=> how good are all interfaces for a certain aspect):
level_0.loc[:,'Total'] = level_0.sum(axis=1)
level_0['Total (%)'] = level_0['Total']/level_0['Total'].sum() * 100
level_0.head()
# Axis labels for the radar charts, in the curated family order set above.
labels = level_0.index
# Preview: all families, first interface only, transposed for the radar factory.
level_0.iloc[:12, :1].T
# Values for the first interface only.
first_interface = level_0.iloc[:12, :1].T.values
build_single_radar(labels, values=first_interface, title=level_0.columns[0], grid=[2,4,6,8], figure_title='Anno')
# All families x all interfaces (sliced precisely so the Total columns added
# above are excluded).
all_interfaces_counts = level_0.iloc[:12, :23].T.values
# Same grades as percentages: recomputed from `base` so the Total columns are
# not included; each interface's column is normalized to sum to 100.
level_0_percent = base.groupby(level=0).sum(axis=1).apply(lambda x: 100*x/float(x.sum()))
# BUG FIX: groupby sorts the family labels alphabetically, while `labels`
# follows the curated reindex order — without realigning, build_multiple_radar
# would pair each family's values with the wrong axis label.
level_0_percent = level_0_percent.reindex(labels)
level_0_percent.head()
# checking we have 100 everywhere (each interface column sums to 100)
level_0_percent.sum()
level_0_percent.max().max()
all_interfaces_percents = level_0_percent.iloc[:12, :23].T.values
build_multiple_radar(labels, all_interfaces_percents, level_0.columns[:23], 'all-interfaces-single')
# Only the 'Total (%)' column: overall share of each feature family.
# NOTE(review): iloc[:, 24:] assumes 23 interface columns (so 'Total (%)' sits
# at position 24) — confirm against the TSV; the per-family cells use 25:.
values = level_0.iloc[:,24:].T.values
build_single_radar(labels, values, "All interfaces",grid=[5,10,15,20],figure_title='all-interfaces-global')
# --- Per-family detail radars, one cell per feature family. ---
# Pattern: take the family sub-frame, add a per-feature 'Total' across all
# interfaces and a 'Total (%)' share of the family total, then plot 'Total (%)'.
# NOTE(review): iloc[:, 25:] assumes 24 interface columns (so positions 24/25
# are 'Total'/'Total (%)') — confirm against the TSV.
metadata = base.loc['newspaper metadata'].copy()
metadata.loc[:,'Total'] = metadata.sum(axis=1)
metadata['Total (%)'] = metadata['Total']/metadata['Total'].sum() * 100
# Notebook cell output: per-feature totals.
metadata.loc[:,'Total']
values_metadata = metadata.iloc[:,25:].T.values
build_single_radar_free(metadata.index, values_metadata, title="Newspaper metadata", figure_title="metadata-global")
# Same pattern for the 'browsing' family.
browsing = base.loc['browsing'].copy()
browsing.loc[:,'Total'] = browsing.sum(axis=1)
browsing['Total (%)'] = browsing['Total']/browsing['Total'].sum() * 100
browsing.loc[:,'Total']
values_browsing = browsing.iloc[:,25:].T.values
build_single_radar_free(browsing.index, values_browsing, title="Browsing", figure_title="browsing-global")
# Same pattern for the 'search' family.
search = base.loc['search'].copy()
search.loc[:,'Total'] = search.sum(axis=1)
search['Total (%)'] = search['Total']/search['Total'].sum() * 100
search['Total']
values_search = search.iloc[:,25:].T.values
build_single_radar_free(search.index, values_search, title="Search", figure_title="search-global")
# 'result display' family: per-feature totals, family share, radar.
rd = base.loc['result display'].copy()
rd.loc[:,'Total'] = rd.sum(axis=1)
rd['Total (%)'] = rd['Total']/rd['Total'].sum() * 100
rd['Total']
values_rd = rd.iloc[:,25:].T.values
build_single_radar_free(rd.index, values_rd, title="Result display", figure_title="result-display-global")
# 'result filtering' family.
rf = base.loc['result filtering'].copy()
rf.loc[:,'Total'] = rf.sum(axis=1)
rf['Total (%)'] = rf['Total']/rf['Total'].sum() * 100
rf['Total']
values_rf = rf.iloc[:,25:].T.values
build_single_radar_free(rf.index, values_rf, title="Result filtering", figure_title="result-filtering-global")
# 'result sorting' family.
rs = base.loc['result sorting'].copy()
rs.loc[:,'Total'] = rs.sum(axis=1)
rs['Total (%)'] = rs['Total']/rs['Total'].sum() * 100
rs['Total']
values_rs = rs.iloc[:,25:].T.values
build_single_radar_free(rs.index, values_rs, title="Result sorting", figure_title="result-sorting-global")
# 'viewer' family: per-feature totals, family share, radar.
viewer = base.loc['viewer'].copy()
viewer.loc[:,'Total'] = viewer.sum(axis=1)
viewer['Total (%)'] = viewer['Total']/viewer['Total'].sum() * 100
viewer.loc[:,'Total']
values_viewer = viewer.iloc[:,25:].T.values
build_single_radar_free(viewer.index, values_viewer, title="Viewer", figure_title="viewer-global")
# 'info on digitization' family: totals only — no radar is built for it here.
info = base.loc['info on digitization'].copy()
info.loc[:,'Total'] = info.sum(axis=1)
info['Total (%)'] = info['Total']/info['Total'].sum() * 100
info.loc[:,'Total']
# 'user interaction' family: totals only — no radar is built for it here.
user = base.loc['user interaction'].copy()
user.loc[:,'Total'] = user.sum(axis=1)
user['Total (%)'] = user['Total']/user['Total'].sum() * 100
user['Total']
# 'enrichment' family: totals, family share, radar.
enrich = base.loc['enrichment'].copy()
enrich.loc[:,'Total'] = enrich.sum(axis=1)
enrich['Total (%)'] = enrich['Total']/enrich['Total'].sum() * 100
enrich['Total']
values_enrich = enrich.iloc[:,25:].T.values
build_single_radar_free(enrich.index, values_enrich, title="Enrichment", figure_title="enrichment-global")
# NOTE(review): this cell was an exact byte-for-byte duplicate of the
# 'enrichment' cell above (same computation, same radar written to
# 'enrichment-global'); the redundant recomputation has been removed.
# 'apis' family: per-feature totals, family share, radar.
apis = base.loc['apis'].copy()
apis.loc[:,'Total'] = apis.sum(axis=1)
# BUG FIX: the denominator referenced the undefined name `conn` (a copy-paste
# slip, presumably from a 'connectivity' cell), which raises NameError; the
# share must be computed against this family's own total.
apis['Total (%)'] = apis['Total']/apis['Total'].sum() * 100
apis['Total']
# NOTE(review): iloc[:, 25:] assumes 24 interface columns so only 'Total (%)'
# is selected — confirm against the TSV.
values_apis = apis.iloc[:,25:].T.values
build_single_radar_free(apis.index, values_apis, title="APIs", figure_title="apis-global")
# Reload the raw data: this restores the rows dropped and the y/n values
# recoded earlier — everything below works on the uncleaned frame.
base = pd.read_csv("../data/InterfaceReview-June2019.tsv", sep="\t", index_col= [0,1], skipinitialspace=True)
# Raw 'Number of pages' figure per interface (a non-binary feature).
pages = base.loc['newspaper collection','Number of pages'].copy()
pages
# Draft bar chart of the page counts, kept commented out for reference.
#objects = pages.index
#y_pos = np.arange(len(objects))
#performance = pages.values
#plt.bar(y_pos, performance, align='center', alpha=0.5, width=0.5)
#plt.xticks(y_pos, objects)
#plt.ylabel('Usage')
#plt.title('Number of pages')
#plt.show()
"newspaper metadata",
"apis",
"connectivity",
"info on digitization",
"enrichment",
"user interaction",
"viewer",
"result display",
"result filtering",
"result sorting",
"search",
"browsing"
level1 = base.droplevel(0)
level1.loc[:,'Total'] = level1.sum(axis=0)
level1