forked from zbw/pm20_bin
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcheck_film_notation.pl
188 lines (163 loc) · 5.14 KB
/
check_film_notation.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
#!/bin/env perl
# nbt, 8.11.2019
# create lists of films from filmdata/*.json
use strict;
use warnings;
use utf8;
use Carp;
use Data::Dumper;
use JSON;
use Path::Tiny;
use Readonly;
use Scalar::Util qw(looks_like_number);
# filmdata publicly available now
my $filmdata_root = path('../data/filmdata');
my $klassdata_root = path('../data/klassdata');
##my $filmdata_root = $film_public_root;
my $img_file = $filmdata_root->child('img_count.json');
my $ip_hints =
path('../web/templates/fragments/ip_hints.de.md.frag')->slurp_utf8;
my %page = (
h => {
column_ids => [
qw/ film_id start_sig start_date end_sig end_date img_count online comment /
],
list => {
'h1_sh' => {
title => 'Länder-Sacharchiv 1. Verfilmung',
},
'h2_sh' => {
title => 'Länder-Sacharchiv 2. Verfilmung',
},
'h1_co' => {
title => 'Firmen- und Institutionenarchiv 1. Verfilmung',
},
'h2_co' => {
title => 'Firmen- und Institutionenarchiv 2. Verfilmung',
},
'h1_wa' => {
title => 'Warenarchiv 1. Verfilmung',
},
'h2_wa' => {
title => 'Warenarchiv 2. Verfilmung',
},
},
},
k => {
column_ids => [
qw/ film_id img_id country geo_sig topic_sig from to no_material comment /
],
list => {
k1_sh => {
title => 'Sacharchiv 1. Verfilmung',
},
k2_sh => {
title => 'Sacharchiv 2. Verfilmung',
},
},
},
);
# notation regex
# (this is a variation of the notation regex in check_ifis_notation.pl)
my %nta_regex = (
ge => {
title => 'Historische Länderklassifikation',
pattern => qr/ ^ [A-Z] # Continent
( \d{0,3} # optional numerical code for country
[a-z]? # optional extension of country code
( ( # optional subdivision in brackets
( \(\d\d?\) ) # either numerical
| \((alt|Wn|Bln)\)# or special codes (old|Wien|Berlin)
) ){0,1}
)? $ /x,
lookup => decode_json( $klassdata_root->child('ag_lookup.json')->slurp ),
},
sh => {
title => 'Alte Hamburger Systematik',
pattern => qr/ ^
[A-Z] | # ignore for now
[a-z] # main class
( \s \d\d # optional subclass
[a-z]? # optional subclass extension
){0,1}
( # optional special folder
\s SM \s .+
){0,1} $ /x,
lookup => decode_json( $klassdata_root->child('je_lookup.json')->slurp ),
},
);
my %sequence = (
sh => [qw/ ge sh /],
wa => [qw/ wa ge /],
co => [qw/ ge co /],
);
## TODO extend to Kiel
##foreach my $prov ( keys %page ) {
foreach my $prov ('h') {
## TODO include companies and wares
## foreach my $page_name ( sort keys %{ $page{$prov}{list} } ) {
foreach my $page_name ( 'h1_sh', 'h2_sh', 'h1_co', 'h2_co' ) {
print "$page_name\n";
my $coll = substr( $page_name, 3, 2 );
my $set = substr( $page_name, 0, 2 );
# read json input
my @film_sections =
@{ decode_json( $filmdata_root->child( $page_name . '.json' )->slurp ) };
# iterate through the list of film sections (from the excel file)
foreach my $film_section (@film_sections) {
##print Dumper $film_section;
foreach my $sig ( 'start_sig', 'end_sig' ) {
next unless $film_section->{$sig};
# skip if special signature indicates empty film
next if $film_section->{$sig} eq 'x';
# remove the text part, reduce to notation
my $nta;
if ( $film_section->{$sig} =~ m/^.+? \[(.+)\]$/ ) {
$nta = $1;
} else {
$nta = $film_section->{$sig};
}
warn( " Missing signature in ", Dumper $film_section) unless $nta;
# split notation at the first blank (second part may have been omitted)
my @nta_parts = $nta =~ m/^(\S+)(?:\s(.+))?$/;
# check the parts of the notation
for ( my $i = 0 ; $i < scalar(@nta_parts) ; $i++ ) {
my $nta_type = $sequence{$coll}->[$i];
my $nta_part = $nta_parts[$i] || '';
# skip empty notation parts
next if $nta_part eq '';
# TODO check also sh and co
next unless $nta_type eq 'ge';
check_nta( $film_section, $sig, $nta_type, $nta_part );
##print Dumper $nta, $nta_type, $nta_part, $film_section; exit;
}
}
}
}
}
#######################
sub check_nta {
my $film_section = shift or die "Param missing";
my $sig = shift or die "Param missing";
my $nta_type = shift or die "Param missing";
my $nta_part = shift or confess "Param missing";
# check the notation formally
if ( not $nta_part =~ m/$nta_regex{$nta_type}{pattern}/x ) {
warn sprintf(
" %-8s: [%s] %s - format error (%s)",
$film_section->{film_id},
$nta_part, $nta_type, $sig
),
"\n";
return;
}
# check if the notation is known
if ( not exists $nta_regex{$nta_type}{lookup}->{$nta_part} ) {
warn sprintf(
" %-8s: [%s] %s - not found (%s)",
$film_section->{film_id},
$nta_part, $nta_type, $sig
),
"\n";
}
}