Skip to content

Commit c848129

Browse files
authored
Merge pull request #112 from arnaud-m/database
Extend the cryptarithm database - #107 - #108 - #109
2 parents 5eeaa41 + d9769d5 commit c848129

File tree

121 files changed

+16235
-311
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+16235
-311
lines changed

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,8 +6,11 @@ src/main/benchmarks/results*/
66
src/main/benchmarks/results*.tar.gz
77
src/main/benchmarks/solver.sh
88
src/main/benchmarks/words
9+
src/main/benchmarks/samples/
910
src/main/benchmarks/oar-stderr.log
1011
src/main/benchmarks/nohup.out
12+
src/main/benchmarks/crypt/
13+
src/main/benchmarks/crypt.awk
1114
src/main/benchmarks/classify-additions.awk
1215

1316
#### joe made this: http://goel.io/joe

src/main/benchmarks/README.org

Lines changed: 161 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,19 @@ exit $?
108108

109109
** Configure instances
110110

111+
*** Enumerate Doubly True
112+
113+
#+BEGIN_SRC sh
114+
generateDoublyTrue 401 500
115+
#+END_SRC
116+
117+
118+
Interrupted processes
119+
- pt-301-400
120+
- el-301-400
121+
- ro-301-400
122+
- el-401-500
123+
111124
*** Enumerate all cryptarithms
112125

113126
The word lists ~colors.txt~ and ~monsters.txt~ take the most time by far (around 110K candidates).
@@ -120,6 +133,7 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
120133
done
121134
#+END_SRC
122135

136+
123137
*** Enumerate the longest cryptarithms
124138

125139
I manually found the longest cryptarithms of the remaining word lists.
@@ -138,16 +152,79 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
138152
generateLong 16 capitales.txt
139153
#+END_SRC
140154

155+
156+
157+
*** Enumerate samples
158+
159+
#+BEGIN_SRC sh
160+
rm -fr samples
161+
mkdir samples
162+
(
163+
cd samples
164+
pwd
165+
../../R/sample-words.R -n 5 -w 80 --minlen 2 --maxlen 7 ../../words/liste.de.mots.francais.frgut.txt
166+
)
167+
#+END_SRC
168+
169+
#+BEGIN_SRC sh
170+
function generateMaxLong() {
171+
find samples/ -type f -print | while read PATHNAME ; do
172+
INSTANCE="instances/"`basename -s .txt $PATHNAME`-$1-$2.dat
173+
echo -minop $1 -maxop $2 $PATHNAME > $INSTANCE
174+
done
175+
}
176+
177+
min=35
178+
step=2
179+
max=46
180+
while [ $min -lt $max ] ; do
181+
generateMaxLong $min $((min+step-1))
182+
min=$((min+step))
183+
done
184+
#+END_SRC
185+
141186
*** Generate languages
142187

143188
#+BEGIN_SRC R :results output file :file "words-10-2.txt"
144189
base <- 10
145190
x <- head(letters, base)
146-
xy <- expand.grid(x = x, y = x)
191+
xy <- expand.grid(x = head(x, 2), y = x)
147192
cat(paste(x, '\n', collapse = ''))
148193
cat(paste(xy$x, xy$y, '\n', sep = "", collapse = ''))
149194
#+END_SRC
150195

196+
#+BEGIN_SRC R :results output file :file "words-10-2.txt"
197+
base <- 10
198+
x <- head(letters, base)
199+
xy <- expand.grid(x = x, y = x)
200+
xyz <- expand.grid(x = head(x, 1), y = tail(x, 2), z = x)
201+
cat(paste(x, '\n', collapse = ''))
202+
cat(paste(xy$x, xy$y, '\n', sep = "", collapse = ''))
203+
cat(paste(xyz$x, xyz$y, xyz$z,'\n', sep = "", collapse = ''))
204+
#+END_SRC
205+
206+
*** Crossword
207+
208+
#+BEGIN_SRC sh
209+
rm -fr samples
210+
mkdir samples
211+
(
212+
cd samples
213+
pwd
214+
../../R/sample-words.R -n 60 -w 20 --minlen 2 --maxlen 3 ../../words/liste.de.mots.francais.frgut.txt
215+
)
216+
#+END_SRC
217+
218+
#+BEGIN_SRC sh
219+
function generateCrossword() {
220+
find samples/ -type f -print | while read PATHNAME ; do
221+
INSTANCE="instances/"`basename -s .txt $PATHNAME`-$1.dat
222+
echo -grid $1 $PATHNAME > $INSTANCE
223+
done
224+
}
225+
226+
generateCrossword 3
227+
#+END_SRC
151228

152229
** Configure the algorithm
153230

@@ -158,30 +235,60 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
158235

159236
* Solve
160237
** Create the solver
238+
*** Compare Models
239+
161240

162241
#+BEGIN_SRC sh :tangle solver.sh
163242
JAR=cryptator-0.6.0-SNAPSHOT-with-dependencies.jar
164243
MAINCLASS=cryptator.Cryptator
165-
166244
MAINARGS=`cat $1 | xargs`
167-
168245
## exclude cryptarithm with long words
169-
## grep -wvE '\w{10,}' $2 | sed 's/[[:space:]]//g' | xargs java -server -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
246+
grep -wvE '\w{9,}' $2 | sed 's/[[:space:]]//g' | xargs java -server -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
247+
exit $?
248+
#+END_SRC
170249

171-
## exclude cryptarithm without long words
172-
## grep -wE '\w{10,}' $2 | sed 's/[[:space:]]//g' | xargs java -server -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
250+
*** Compare Solvers
173251

252+
#+BEGIN_SRC sh :tangle solver.sh
253+
JAR=cryptator-0.6.0-SNAPSHOT-with-dependencies.jar
254+
MAINCLASS=cryptator.Cryptator
255+
MAINARGS=`cat $1 | xargs`
174256
## Do not exclude anything
175257
sed 's/[[:space:]]//g' $2 | xargs java -server -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
258+
176259
exit $?
177260
#+END_SRC
178261

179-
** Configure instances
262+
**** Crypt
180263

181-
Select instance files in the directory ~instances-solve~.
264+
The very fast [[https://tamura70.gitlab.io/web-puzzle/cryptarithm/][crypt solver]] by Naoyuki Tamura, written in C.
182265

183-
- Filename prefixed by ~long~ requires the bignum model.
184-
- Filename prefixed by ~short~ are accepted by the scalar model.
266+
Postprocess the output of the crypt solver.
267+
#+BEGIN_SRC awk :tangle crypt.awk
268+
#!/usr/bin/awk
269+
{
270+
if(NR % 3 == 2 && $_ !~ /Total time =/) {
271+
printf "i%s\n",$_
272+
} else if(NR % 3 == 1 && NR > 1) {
273+
print "d NBSOLS",$1
274+
print "d TIME",$3/1000
275+
} else if(NR % 3 == 0) {
276+
printf "s%s\n",$_
277+
} else print $_
278+
}
279+
#+END_SRC
280+
281+
Encapsulate the solver for benchmarking
282+
#+BEGIN_SRC sh :tangle solver.sh
283+
#!/bin/sh
284+
./crypt/crypt < $2 | awk -f crypt.awk
285+
#+END_SRC
286+
287+
** Configure instances
288+
289+
#+BEGIN_SRC sh
290+
ln -s ../cryptarithms instances
291+
#+END_SRC
185292

186293
** Configure the algorithm
187294

@@ -194,10 +301,10 @@ Select instance files in the directory ~instances-solve~.
194301
echo $ARGS -l TRUE -search 1 > algorithms/BIGNUM-1.dat
195302
echo $ARGS -l FALSE -h FALSE -search 1 > algorithms/SCALAR-1.dat
196303
echo $ARGS -l FALSE -h TRUE -search 1 > algorithms/HORNER-1.dat
197-
198304
#+END_SRC
199305

200306

307+
201308
* Testing
202309

203310
Change the file extension accordingly.
@@ -259,14 +366,12 @@ Change the file extension accordingly.
259366

260367
#+BEGIN_SRC sh
261368
INDIR=results-cryptarithms
262-
OUTDIR=results-classified
263-
rm -fr $OUTDIR
264-
mkdir $OUTDIR
369+
OUTDIR=../cryptarithms/samples
370+
265371
## Classify
266372
(
267373
cd $OUTDIR
268374
find ../$INDIR -name "*.db.txt" -exec awk -f ../classify-additions.awk {} \;
269-
270375
)
271376
## Sort cryptarithms
272377
find $OUTDIR -name '*.db.txt' -exec sort -u -o {} {} \;
@@ -275,4 +380,45 @@ Change the file extension accordingly.
275380

276381

277382

383+
#+BEGIN_SRC sh
384+
OUTDIR=../cryptarithms/samples
385+
TEMPFILE=`mktemp`
386+
wc -l $OUTDIR/* | head -n -1 | while read LINECOUNT DBFILE ; do
387+
if [ $LINECOUNT -gt 1000 ] ; then
388+
head -n 1000 $DBFILE > $TEMPFILE
389+
mv $TEMPFILE $DBFILE
390+
fi
391+
done
392+
393+
#+END_SRC
394+
395+
396+
278397
* Analysis
398+
399+
#+BEGIN_SRC R
400+
dbfiles <- list.files(c("../cryptarithms"), pattern = "*.db.txt", recursive = TRUE, full.names=TRUE)
401+
dbfiles <- subset(dbfiles, !grepl("external-contributions", dbfiles))
402+
cryptarithms <- unlist(sapply(dbfiles, readLines))
403+
cryptarithms <- subset(cryptarithms, nchar(cryptarithms) > 0)
404+
operands <- strsplit(cryptarithms, "[ +=]+")
405+
406+
ExtractFeatures <- function(x) {
407+
y <- nchar(x)
408+
r <- range(y)
409+
z <- length(unique(unlist(strsplit(x, ""))))
410+
c( words = length(x), letters = sum(y), symbols = z, meanLen = mean(y), medianLen = median(y), minLen = r[1], maxLen = r[2], diffLen = r[2] - r[1])
411+
}
412+
413+
features <- as.data.frame(t(sapply(operands, ExtractFeatures)))
414+
cat(nrow(features), "cryptarithms in database\n")
415+
summary(features)
416+
417+
library(dplyr)
418+
library(ggplot2)
419+
x <- features %>% count(words, maxLen)
420+
x$n <- log10(x$n)
421+
ggplot(x, aes(words, maxLen, fill= n)) +
422+
geom_tile()
423+
424+
#+END_SRC

src/main/benchmarks/instances-solve/long-cryptarithms-10.db.txt

Lines changed: 0 additions & 20 deletions
This file was deleted.

src/main/benchmarks/instances-solve/short-cryptarithms-10.db.txt

Lines changed: 0 additions & 82 deletions
This file was deleted.

src/main/cryptarithms/README.org

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,5 @@
44

55
- Thousands of cryptarithms generated by our solver.
66
- Each file contains a theme collection.
7+
- The directory ~samples~ contains cryptarithms generated using word lists sampled from the French dictionary. The filename indicates the number of summands (prefixed by ~N~), the length of the shortest word, and the length of the longest word.
8+
- The file ~external-contributions.db.txt~ contains cryptarithms found on the internet.

0 commit comments

Comments
 (0)