In [1]:
%matplotlib inline
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from interface.radar_factory import build_single_radar, build_multiple_radar, build_single_radar_free

Preparing data

Loading the review spreadsheet (TSV) as a multi-index DataFrame

In [2]:
base = pd.read_csv("../data/InterfaceReview-June2019.tsv", sep="\t", index_col=[0, 1], skipinitialspace=True)
In [ ]:
base.index

Testing some indexing

In [ ]:
base.loc['newspaper metadata']
In [ ]:
base.loc['info on digitization', 'OCR confidence scores'].describe()
In [ ]:
base.loc['newspaper metadata','Place of publication']
In [ ]:
# about multilingual collections
base.loc['newspaper collection', 'Languages of the collections'].value_counts()
In [ ]:
# about access models
base.loc['interface', 'Access model'].value_counts()
In [ ]:
# about interface providers
base.loc['interface', 'Interface provider'].value_counts()

Trim strings to be safe

In [3]:
def trim_all_columns(df):
    """
    Trim whitespace from ends of each value across all series in dataframe
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.applymap(trim_strings)

# trim
base = trim_all_columns(base)
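
Note: on pandas >= 2.1, applymap is deprecated in favour of the element-wise DataFrame.map. A drop-in sketch of the same helper for newer pandas versions (the _v2 name is just for illustration):

In [ ]:
def trim_all_columns_v2(df):
    """
    Same as above, but with DataFrame.map (the renamed applymap in pandas >= 2.1)
    """
    trim_strings = lambda x: x.strip() if isinstance(x, str) else x
    return df.map(trim_strings)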

Remove undesirable rows (i.e. rows whose values are not purely binary)

In [4]:
# removing non-binary features
base = base.drop('interface', level=0)
base = base.drop('Languages of the collections', level=1) # from 'newspaper collection'
base = base.drop('Number of pages', level=1) # from 'newspaper collection'
base = base.drop('Number of articles', level=1) # from 'newspaper collection'
base = base.drop('Number of issues', level=1) # from 'newspaper collection'
base = base.drop('Number of newspaper titles', level=1) # from 'newspaper collection'
base = base.drop('Newspaper date range', level=1) # from 'newspaper collection'

base = base.drop('Other', level=1)
base = base.drop('Download options (file formats)', level=1)
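
For reference, the removals above can be written more compactly as one drop per index level; an equivalent, unexecuted alternative:

In [ ]:
# equivalent compact form of the removals above (do not run both cells)
base = base.drop('interface', level=0)
base = base.drop(['Languages of the collections', 'Number of pages', 'Number of articles',
                  'Number of issues', 'Number of newspaper titles', 'Newspaper date range',
                  'Other', 'Download options (file formats)'], level=1)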

Replace 'n' and 'y' variants with 0 and 1 (ideally this cleaning would be done in the spreadsheet itself)

In [5]:
base = base.replace(to_replace=['y', 'y?', 'y (annotations)', 'y (requires user account - free)', 'n', '?', 'u', 'n?'], 
                    value=[1,1,1,1,0,0,0,0])
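
A quick sanity check that the replacement left only binary values (a minimal sketch; NaN cells, if any, are ignored by stack):

In [ ]:
# every non-null cell should now be 0 or 1
leftover = set(base.stack().unique()) - {0, 1}
print("non-binary values left:", leftover or "none")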
In [ ]:
base.head()

Interface charts

Counts

In [ ]:
# Sum over the level-1 rows => 'grade' of each interface per family of features
level_0 = base.groupby(level=0).sum()
In [ ]:
# re-order index
level_0 = level_0.reindex(["newspaper metadata", 
                 "apis",
                 "connectivity",
                 "info on digitization",
                 "enrichment",
                 "user interaction",
                 "viewer",
                 "result display",
                 "result filtering",
                 "result sorting",
                 "search",
                 "browsing"
                    ])
In [ ]:
level_0.head()

Observations per feature family

In [ ]:
# Total sum per row, i.e. per feature family (=> how well do the interfaces collectively cover a certain aspect):
level_0.loc[:,'Total'] = level_0.sum(axis=1)
In [ ]:
level_0['Total (%)'] = level_0['Total']/level_0['Total'].sum() * 100
In [ ]:
level_0.head()

Test radar with one interface

In [ ]:
# getting the labels
labels = level_0.index
In [ ]:
# Get all rows of the first column only, and transpose (as expected by the radar factory)
level_0.iloc[:12, :1].T
In [ ]:
# take only the values
first_interface = level_0.iloc[:12, :1].T.values
In [ ]:
build_single_radar(labels, values=first_interface, title=level_0.columns[0], grid=[2,4,6,8], figure_title='Anno')

Radar view for each interface in one figure

In [ ]:
# take the data: all rows and the first 23 interface columns (selecting precisely in case Total columns have been added)
all_interfaces_counts = level_0.iloc[:12, :23].T.values

# same with percentages (each interface's family counts normalised to sum to 100)
level_0_percent = base.groupby(level=0).sum().apply(lambda x: 100 * x / float(x.sum()))
In [ ]:
level_0_percent.head()
In [ ]:
# checking that every column sums to 100
level_0_percent.sum()
In [ ]:
# maximum percentage overall (useful for choosing the radar grid)
level_0_percent.max().max()
In [ ]:
all_interfaces_percents = level_0_percent.iloc[:12, :23].T.values
In [ ]:
build_multiple_radar(labels, all_interfaces_percents, level_0.columns[:23], 'all-interfaces-single')

Global radar view (i.e. for all interfaces) over all features

In [ ]:
# take only the last column: Total (%) per feature family
values = level_0.iloc[:, 25:].T.values
In [ ]:
build_single_radar(labels, values, "All interfaces",grid=[5,10,15,20],figure_title='all-interfaces-global')

Metadata (global)

In [8]:
metadata = base.loc['newspaper metadata'].copy()
metadata.loc[:,'Total'] = metadata.sum(axis=1)
metadata['Total (%)'] = metadata['Total']/metadata['Total'].sum() * 100
metadata.loc[:,'Total']
Out[8]:
Alternative titles, succeeding titles, related titles    14.0
Place of publication                                     19.0
Geographic coverage                                      10.0
Publisher                                                13.0
Date range                                               23.0
Frequency (i.e. periodicity)                             10.0
ISSN, OCLC, LCCN                                          3.0
External links                                            5.0
Description of newspaper (historical)                    13.0
Language                                                  9.0
Calendar view of issues                                  11.0
Indication of archive holder                             11.0
Name: Total, dtype: float64
In [9]:
values_metadata = metadata.iloc[:,25:].T.values
build_single_radar_free(metadata.index, values_metadata, title="Newspaper metadata", figure_title="metadata-global")
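
The per-family cells below all repeat the same four steps (select the family, add Total, add Total (%), plot). A hypothetical helper along these lines could factor out the pattern, assuming as above that the 24 interface columns put Total (%) at position 25:

In [ ]:
# hypothetical convenience wrapper around the repeated per-family pattern
def family_radar(df, family, title, figure_title):
    fam = df.loc[family].copy()
    fam.loc[:, 'Total'] = fam.sum(axis=1)
    fam['Total (%)'] = fam['Total'] / fam['Total'].sum() * 100
    values = fam.iloc[:, 25:].T.values  # just the Total (%) column
    build_single_radar_free(fam.index, values, title=title, figure_title=figure_title)
    return fam

# e.g.: browsing = family_radar(base, 'browsing', 'Browsing', 'browsing-global')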

Browsing

In [10]:
browsing = base.loc['browsing'].copy()
browsing.loc[:,'Total'] = browsing.sum(axis=1)
browsing['Total (%)'] = browsing['Total']/browsing['Total'].sum() * 100
browsing.loc[:,'Total']
Out[10]:
By date                             14.0
By title                            19.0
By place of publication             12.0
By user tag                          2.0
By newspaper thematic (metadata)     5.0
Name: Total, dtype: float64
In [11]:
values_browsing = browsing.iloc[:,25:].T.values
build_single_radar_free(browsing.index, values_browsing, title="Browsing", figure_title="browsing-global")
In [12]:
search = base.loc['search'].copy()
search.loc[:,'Total'] = search.sum(axis=1)
search['Total (%)'] = search['Total']/search['Total'].sum() * 100
search['Total']
Out[12]:
Keyword search                          24.0
Query autocomplete                       5.0
Boolean operators                       21.0
Phrase search                           14.0
Fuzzy search                             7.0
Wild card                                7.0
Proximity search                         5.0
Limit by date range                     22.0
Limit by language                        5.0
Limit by NP title(s)                    16.0
Limit by place of publication           12.0
Limit by NP thematic (from metadata)     4.0
Limit by NP segments / zones            11.0
Limit by article category                2.0
Limit by article length                  1.0
Limit by archival holder                 4.0
Limit by license/accessibility           4.0
Query suggestion                         1.0
Search by NE                             1.0
Name: Total, dtype: float64
In [13]:
values_search = search.iloc[:,25:].T.values
build_single_radar_free(search.index, values_search, title="Search", figure_title="search-global")

Result display

In [14]:
rd = base.loc['result display'].copy()
rd.loc[:,'Total'] = rd.sum(axis=1)
rd['Total (%)'] = rd['Total']/rd['Total'].sum() * 100
rd['Total']
Out[14]:
Distribution over time                      8.0
Distribution by publication place           1.0
Distribution by NP                          5.0
Distribution by place names in articles     1.0
Snippet preview                            20.0
Highlight of searches in facsimiles        19.0
Highlight of searches in OCRised text       9.0
Ngrams                                      3.0
Name: Total, dtype: float64
In [15]:
values_rd = rd.iloc[:,25:].T.values
build_single_radar_free(rd.index, values_rd, title="Result display", figure_title="result-display-global")

Result filtering

In [16]:
rf = base.loc['result filtering'].copy()
rf.loc[:,'Total'] = rf.sum(axis=1)
rf['Total (%)'] = rf['Total']/rf['Total'].sum() * 100
rf['Total']
Out[16]:
By NP titles                                             16.0
By publishing frequency                                   2.0
By political, religious, ... orientation of newspaper     1.0
By newspaper thematic (metadata)                          6.0
By content types                                          7.0
By sections/rubriques                                     3.0
By events                                                 1.0
By persons                                                3.0
By organisations                                          4.0
By places mentioned in text                               3.0
By time period                                           20.0
By topics                                                 1.0
By manual tags                                            2.0
By publication place                                     14.0
By archive                                                4.0
By publisher                                              5.0
By article length                                         4.0
By authors                                                4.0
By segmentation level                                     1.0
By language                                               6.0
By license                                                3.0
By online publication date                                3.0
Name: Total, dtype: float64
In [17]:
values_rf = rf.iloc[:,25:].T.values
build_single_radar_free(rf.index, values_rf, title="Result filtering", figure_title="result-filtering-global")

Result sorting

In [18]:
rs = base.loc['result sorting'].copy()
rs.loc[:,'Total'] = rs.sum(axis=1)
rs['Total (%)'] = rs['Total']/rs['Total'].sum() * 100
rs['Total']
Out[18]:
By relevance                  21.0
By date                       21.0
By NP title                   11.0
By article title               4.0
By content type                4.0
By online publication date     5.0
By author                      2.0
By quality of text             1.0
By language                    0.0
By popularity                  1.0
Name: Total, dtype: float64
In [19]:
values_rs = rs.iloc[:,25:].T.values
build_single_radar_free(rs.index, values_rs, title="Result sorting", figure_title="result-sorting-global")

Viewer

In [20]:
viewer = base.loc['viewer'].copy()
viewer.loc[:,'Total'] = viewer.sum(axis=1)
viewer['Total (%)'] = viewer['Total']/viewer['Total'].sum() * 100
viewer.loc[:,'Total']
Out[20]:
Facsimile displayed                  24.0
Optional OCRed text display          18.0
Show full page                       17.0
Interactive mini-map                  5.0
Overview of available issues         10.0
Search in viewed page                 7.0
Option to continue to next page      24.0
Option to continue to next result     7.0
Name: Total, dtype: float64
In [21]:
values_viewer = viewer.iloc[:,25:].T.values
build_single_radar_free(viewer.index, values_viewer, title="Viewer", figure_title="viewer-global")

Info on digitization

In [22]:
info = base.loc['info on digitization'].copy()
info.loc[:,'Total'] = info.sum(axis=1)
info['Total (%)'] = info['Total']/info['Total'].sum() * 100
info.loc[:,'Total']
Out[22]:
OLR at article level                                                          0.0
OCR confidence scores                                                         0.0
OLR confidence scores                                                         0.0
Documentation of biases and shortcomings with regard to method and corpus?    0.0
Search result relevance score                                                 0.0
Digitisation date at title level                                              0.0
Scan resolution (in dpi)                                                      0.0
Information on used OCR tools                                                 0.0
Copyright notice                                                              0.0
Documentation of scan methods                                                 0.0
Name: Total, dtype: float64

User interaction

In [23]:
user = base.loc['user interaction'].copy()
user.loc[:,'Total'] = user.sum(axis=1)
user['Total (%)'] = user['Total']/user['Total'].sum() * 100
user['Total']
Out[23]:
Save articles to favorites                     0.0
Save queries to favorites                      0.0
Tag articles                                   0.0
Keep track of viewed materials                 0.0
Article recommendations                        0.0
Permalinks                                     0.0
Export citation                                0.0
Option to correct OCR                          0.0
Option to correct OLR                          0.0
Users can add/edit of metadata                 0.0
Screenshot tool                                0.0
Bulk downloads                                 0.0
Organise articles in collections/favorites     0.0
Contrastive view of personal collections       0.0
Name: Total, dtype: float64

Enrichment

In [24]:
enrich = base.loc['enrichment'].copy()
enrich.loc[:,'Total'] = enrich.sum(axis=1)
enrich['Total (%)'] = enrich['Total']/enrich['Total'].sum() * 100
enrich['Total']
Out[24]:
NERC                                                    2.0
Entity linking                                          2.0
Automatic post-OCR correction                           2.0
Crowd-sourced post-OCR correction                       4.0
Topic Modelling                                         1.0
Text re-use                                             0.0
Sentiment Analysis                                      0.0
Query                                                   0.0
Recommendations (of similar and/or relevant content)    1.0
Event detection                                         1.0
Name: Total, dtype: float64
In [25]:
values_enrich = enrich.iloc[:,25:].T.values
build_single_radar_free(enrich.index, values_enrich, title="Enrichment", figure_title="enrichment-global")

APIs

In [26]:
apis = base.loc['apis'].copy()
apis.loc[:,'Total'] = apis.sum(axis=1)
apis['Total (%)'] = apis['Total']/apis['Total'].sum() * 100
apis['Total']
Out[26]:
Link to source code of the interface    0.0
API                                     5.0
IIIF Image API                          7.0
IIIF Presentation API                   0.0
Name: Total, dtype: float64
In [28]:
values_apis = apis.iloc[:,25:].T.values
build_single_radar_free(apis.index, values_apis, title="APIs", figure_title="apis-global")

Number of pages per collection

In [33]:
# reload the raw spreadsheet (including the non-binary rows dropped earlier)
base = pd.read_csv("../data/InterfaceReview-June2019.tsv", sep="\t", index_col=[0, 1], skipinitialspace=True)
In [34]:
pages = base.loc['newspaper collection','Number of pages'].copy()
In [49]:
pages
Out[49]:
Austrian Newspapers Online (ANNO)                   21,000,000
Ancestry                                           492,000,000
British Newspaper Archives                          32,000,000
California Digital Newspaper Collection (CDNC)       4,410,100
Chronicling America                                 14,949,638
Colorado Historical Newspaper Collection (CHNC)      1,400,000
Delpher                                             11,000,000
DigiPress                                            6,544,988
DIFMOE                                                       u
E-luxemburgensia                                             u
E-newspaperarchives                                  3,849,029
Europeana Newspapers                                         u
L'Express                                                    u
Gallica                                                      u
Georgia Historic Newspapers                                  u
Libraria - Ukrainian online periodicals archive        411,243
New York Times                                               u
POLONA                                                       u
Retronews                                                    u
Scriptorium                                                  u
StaBi                                                        u
Tessmann                                                     u
Le Temps archives                                    1,000,000
Trove                                               24,000,000
Name: (newspaper collection, Number of pages), dtype: object
In [50]:
# draft bar chart, kept commented out: pages still contains 'u' and comma-separated strings
#objects = pages.index
#y_pos = np.arange(len(objects))
#performance = pages.values

#plt.bar(y_pos, performance, align='center', alpha=0.5, width=0.5)
#plt.xticks(y_pos, objects)
#plt.ylabel('Usage')
#plt.title('Number of pages')

#plt.show()
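
The draft above fails as-is because the page counts are strings with thousands separators, and 'u' marks unknown values. A minimal cleaning-and-plotting sketch (unknown collections are simply dropped; reuses the pandas/matplotlib imports from the top of the notebook):

In [ ]:
# coerce '21,000,000'-style strings to numbers; 'u' (unknown) becomes NaN and is dropped
pages_clean = pd.to_numeric(pages.str.replace(',', ''), errors='coerce').dropna()

plt.figure(figsize=(10, 4))
plt.bar(pages_clean.index, pages_clean.values, align='center', alpha=0.5, width=0.5)
plt.xticks(rotation=90)
plt.ylabel('Number of pages')
plt.title('Number of pages per collection')
plt.tight_layout()
plt.show()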
In [ ]:
# Note: assumes the cleaned binary base (re-run the trimming/replacement cells after the reload above)
level1 = base.droplevel(0)
In [ ]:
# total per feature, summed across all interfaces
level1.loc[:,'Total'] = level1.sum(axis=1)
In [ ]:
level1
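
To see which individual features are most and least common across the interfaces, the per-feature totals can be sorted (a minimal sketch, assuming level1 was built from the cleaned binary base):

In [ ]:
# most widely implemented individual features first
level1['Total'].sort_values(ascending=False)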