arnaud-m
diff --git a/‎.gitignore
Lines changed: 3 additions & 0 deletions b/‎.gitignore
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/main/benchmarks/README.org
Lines changed: 161 additions & 15 deletions b/‎src/main/benchmarks/README.org
Lines changed: 161 additions & 15 deletions
diff --git a/‎src/main/benchmarks/instances-solve/long-cryptarithms-10.db.txt
Lines changed: 0 additions & 20 deletions b/‎src/main/benchmarks/instances-solve/long-cryptarithms-10.db.txt
Lines changed: 0 additions & 20 deletions
diff --git a/‎src/main/benchmarks/instances-solve/short-cryptarithms-10.db.txt
Lines changed: 0 additions & 82 deletions b/‎src/main/benchmarks/instances-solve/short-cryptarithms-10.db.txt
Lines changed: 0 additions & 82 deletions
diff --git a/‎src/main/benchmarks/instances-solve/long-cryptarithms-ext-15.db.txt renamed to ‎src/main/benchmarks/other-bases/external-contributions-base-15.db.txt b/‎src/main/benchmarks/instances-solve/long-cryptarithms-ext-15.db.txt renamed to ‎src/main/benchmarks/other-bases/external-contributions-base-15.db.txt
diff --git a/‎src/main/benchmarks/instances-solve/long-cryptarithms-ext-16.db.txt renamed to ‎src/main/benchmarks/other-bases/external-contributions-base-16.db.txt b/‎src/main/benchmarks/instances-solve/long-cryptarithms-ext-16.db.txt renamed to ‎src/main/benchmarks/other-bases/external-contributions-base-16.db.txt
diff --git a/‎src/main/cryptarithms/README.org
Lines changed: 2 additions & 0 deletions b/‎src/main/cryptarithms/README.org
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/main/cryptarithms/de-1-200.db.txt renamed to ‎src/main/cryptarithms/de-1-500.db.txt b/‎src/main/cryptarithms/de-1-200.db.txt renamed to ‎src/main/cryptarithms/de-1-500.db.txt
diff --git a/‎src/main/cryptarithms/en-1-200.db.txt renamed to ‎src/main/cryptarithms/en-1-500.db.txt b/‎src/main/cryptarithms/en-1-200.db.txt renamed to ‎src/main/cryptarithms/en-1-500.db.txt
@@ -6,8 +6,11 @@ src/main/benchmarks/results*/
 src/main/benchmarks/results*.tar.gz
 src/main/benchmarks/solver.sh
 src/main/benchmarks/words
+src/main/benchmarks/samples/
 src/main/benchmarks/oar-stderr.log
 src/main/benchmarks/nohup.out
+src/main/benchmarks/crypt/
+src/main/benchmarks/crypt.awk 
 src/main/benchmarks/classify-additions.awk 
 
 #### joe made this: http://goel.io/joe
 
@@ -108,6 +108,19 @@ exit $?
 
 ** Configure instances
 
+*** Enumerate Doubly True
+
+#+BEGIN_SRC sh
+  generateDoublyTrue 401 500
+ #+END_SRC
+
+
+Interrupted processes
+  - pt-301-400
+  - el-301-400
+  - ro-301-400
+  - el-401-500
+
 *** Enumerate all cryptarithms
 
 The word lists ~colors.txt~ and ~monsters.txt~ take the most time by far (around 110K candidates).
@@ -120,6 +133,7 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
   done
  #+END_SRC
 
+
 *** Enumerate the longest cryptarithms
 
   I have manually found the longest cryptarithms of the remaining word lists.
@@ -138,16 +152,79 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
    generateLong 16 capitales.txt
  #+END_SRC
 
+
+
+*** Enumerate samples
+
+ #+BEGIN_SRC sh
+   ## Start from an empty samples directory.
+   rm -fr samples
+   mkdir samples
+   (
+       ## Leave the subshell if the directory cannot be entered, so that the
+       ## sampler is never run from the wrong working directory.
+       cd samples || exit 1
+       pwd
+       ## NOTE(review): the sampler presumably writes its word lists into the
+       ## current directory -- confirm against sample-words.R.
+       ../../R/sample-words.R -n 5 -w 80 --minlen 2 --maxlen 7  ../../words/liste.de.mots.francais.frgut.txt
+   )
+ #+END_SRC
+
+ #+BEGIN_SRC sh
+   ## Write one instance file per file found below samples/.
+   ##   $1: value passed to the -minop option
+   ##   $2: value passed to the -maxop option
+   ## The instance is named after the sample file (without .txt) and both bounds.
+   function generateMaxLong() {
+       ## IFS= and -r keep each path exactly as printed by find.
+       find samples/ -type f -print | while IFS= read -r PATHNAME ; do
+           INSTANCE="instances/$(basename -s .txt "$PATHNAME")-$1-$2.dat"
+           echo -minop "$1" -maxop "$2" "$PATHNAME" > "$INSTANCE"
+       done
+   }
+
+   ## Sweep min from 35 while it stays below max, in windows of $step values:
+   ## each call receives the bounds min and min+step-1.
+   min=35
+   step=2
+   max=46
+   while [ "$min" -lt "$max" ] ; do
+       generateMaxLong "$min" "$((min+step-1))"
+       min=$((min+step))
+   done
+ #+END_SRC
+
 *** Generate languages
 
  #+BEGIN_SRC R :results output file :file "words-10-2.txt"
    base <- 10
    x <- head(letters, base)
-   xy <- expand.grid(x = x, y = x)
+   xy <- expand.grid(x = head(x, 2), y = x)
    cat(paste(x, '\n', collapse = ''))
    cat(paste(xy$x, xy$y, '\n', sep = "", collapse = ''))
 #+END_SRC
 
+ #+BEGIN_SRC R :results output file :file "words-10-2.txt"
+   ## Print a word list over the first `base` letters: every single letter,
+   ## every two-letter word, and the three-letter words made of the first
+   ## letter, one of the last two letters, then any letter.
+   base <- 10
+   alphabet <- letters[seq_len(base)]
+   pairs <- expand.grid(x = alphabet, y = alphabet)
+   triples <- expand.grid(x = alphabet[1], y = tail(alphabet, 2), z = alphabet)
+   cat(paste(alphabet, "\n", collapse = ""))
+   cat(paste0(pairs$x, pairs$y, "\n", collapse = ""))
+   cat(paste0(triples$x, triples$y, triples$z, "\n", collapse = ""))
+#+END_SRC
+
+*** Crossword
+
+ #+BEGIN_SRC sh
+   ## Start from an empty samples directory.
+   rm -fr samples
+   mkdir samples
+   (
+       ## Leave the subshell if the directory cannot be entered, so that the
+       ## sampler is never run from the wrong working directory.
+       cd samples || exit 1
+       pwd
+       ## NOTE(review): the sampler presumably writes its word lists into the
+       ## current directory -- confirm against sample-words.R.
+       ../../R/sample-words.R -n 60 -w 20 --minlen 2 --maxlen 3  ../../words/liste.de.mots.francais.frgut.txt
+   )
+ #+END_SRC
+
+ #+BEGIN_SRC sh
+   ## Write one instance file per file found below samples/.
+   ##   $1: value passed to the -grid option
+   ## The instance is named after the sample file (without .txt) and $1.
+   function generateCrossword() {
+       ## IFS= and -r keep each path exactly as printed by find.
+       find samples/ -type f -print | while IFS= read -r PATHNAME ; do
+           INSTANCE="instances/$(basename -s .txt "$PATHNAME")-$1.dat"
+           echo -grid "$1" "$PATHNAME" > "$INSTANCE"
+       done
+   }
+
+   generateCrossword 3
+ #+END_SRC
 
 ** Configure the algorithm
 
@@ -158,30 +235,60 @@ The word lists ~colors.txt~ and ~monsters.txt~ takes the most time by far (aroun
 
 * Solve
 ** Create the solver
+*** Compare Models
+
 
 #+BEGIN_SRC sh :tangle solver.sh
   JAR=cryptator-0.6.0-SNAPSHOT-with-dependencies.jar
   MAINCLASS=cryptator.Cryptator
-
   MAINARGS=`cat $1 | xargs`
-
   ## exclude cryptarithm with long words
-  ## grep -wvE '\w{10,}' $2 | sed 's/[[:space:]]//g'  | xargs java -server  -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
+  grep -wvE '\w{9,}' $2 | sed 's/[[:space:]]//g'  | xargs java -server  -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
+  exit $?
+ #+END_SRC
 
-  ## exclude cryptarithm without long words
-  ## grep -wE '\w{10,}' $2 | sed 's/[[:space:]]//g'  | xargs java -server  -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
+*** Compare Solvers
 
+#+BEGIN_SRC sh :tangle solver.sh
+  JAR=cryptator-0.6.0-SNAPSHOT-with-dependencies.jar
+  MAINCLASS=cryptator.Cryptator
+  MAINARGS=`cat $1 | xargs`
   ## Do not exclude anything
   sed 's/[[:space:]]//g'  $2 | xargs java -server  -Xms512m -Xmx8192m -cp $JAR $MAINCLASS $MAINARGS
+
   exit $?
  #+END_SRC
 
-** Configure instances
+**** Crypt
 
-Select instance files in the directory ~instances-solve~.
+The very fast [[https://tamura70.gitlab.io/web-puzzle/cryptarithm/][crypt solver]], written in C by Naoyuki Tamura.
 
- - Filename prefixed by ~long~ requires the bignum model.
- - Filename prefixed by ~short~ are accepted by the scalar model.
+Postprocess the output of the crypt solver.
+#+BEGIN_SRC awk :tangle crypt.awk
+  #!/usr/bin/awk -f
+  ## Tag the output of the crypt solver, which is read in groups of three lines.
+  ## $0 is the whole current input line.
+  {
+      if(NR % 3 == 2 && $0 !~ /Total time =/) {
+          ## Second line of a group: printed with an "i" prefix.
+          printf "i%s\n",$0
+      } else if(NR % 3 == 1 && NR > 1) {
+          ## First line of every group except the very first line of the input:
+          ## field 1 is reported as NBSOLS and field 3 / 1000 as TIME.
+          ## NOTE(review): presumably converts milliseconds to seconds -- confirm.
+          print "d NBSOLS",$1
+          print "d TIME",$3/1000
+      } else if(NR % 3 == 0) {
+          ## Third line of a group: printed with an "s" prefix.
+          printf "s%s\n",$0
+      } else print $0  ## first input line, or a "Total time =" line: unchanged
+  }
+ #+END_SRC
+
+ Encapsulate the solver for benchmarking
+#+BEGIN_SRC sh :tangle solver.sh
+  #!/bin/sh
+  ## Feed the file given as second argument to the crypt solver and
+  ## post-process its output with crypt.awk. The first argument is not used.
+  ./crypt/crypt < "$2" | awk -f crypt.awk
+ #+END_SRC
+
+** Configure instances
+
+ #+BEGIN_SRC sh
+   ln -s ../cryptarithms instances
+ #+END_SRC
 
 ** Configure the algorithm
 
@@ -194,10 +301,10 @@ Select instance files in the directory ~instances-solve~.
    echo $ARGS -l TRUE -search 1 > algorithms/BIGNUM-1.dat
    echo $ARGS -l FALSE -h FALSE -search 1 > algorithms/SCALAR-1.dat
    echo $ARGS -l FALSE -h TRUE -search 1 > algorithms/HORNER-1.dat
-
  #+END_SRC
 
 
+
 * Testing
 
 Change the file extension accordingly.
@@ -259,14 +366,12 @@ Change the file extension accordingly.
 
 #+BEGIN_SRC sh
   INDIR=results-cryptarithms
-  OUTDIR=results-classified
-  rm -fr $OUTDIR
-  mkdir $OUTDIR
+  OUTDIR=../cryptarithms/samples
+
   ## Classify
   (
       cd $OUTDIR
       find ../$INDIR -name "*.db.txt" -exec awk -f ../classify-additions.awk {} \;
-
   )
   ## Sort cryptarithms
   find $OUTDIR -name '*.db.txt' -exec sort -u -o {} {} \;
@@ -275,4 +380,45 @@ Change the file extension accordingly.
 
 
 
+#+BEGIN_SRC sh
+  OUTDIR=../cryptarithms/samples
+  TEMPFILE=`mktemp`
+  wc -l $OUTDIR/* | head -n -1 | while read LINECOUNT DBFILE ; do
+      if [ $LINECOUNT -gt 1000 ] ; then
+      head -n 1000 $DBFILE > $TEMPFILE
+      mv $TEMPFILE $DBFILE
+      fi
+    done
+
+#+END_SRC
+
+
+
 * Analysis
+
+#+BEGIN_SRC R
+  ## Collect every cryptarithm database below ../cryptarithms, leaving out the
+  ## external contributions. `pattern` is a regular expression, not a glob,
+  ## hence the escaped dots and the end anchor.
+  dbfiles <- list.files("../cryptarithms", pattern = "\\.db\\.txt$", recursive = TRUE, full.names = TRUE)
+  dbfiles <- dbfiles[!grepl("external-contributions", dbfiles, fixed = TRUE)]
+  ## lapply always returns a list, whatever the number of lines in each file;
+  ## as.character keeps an empty character vector when no file is found.
+  cryptarithms <- as.character(unlist(lapply(dbfiles, readLines), use.names = FALSE))
+  cryptarithms <- cryptarithms[nchar(cryptarithms) > 0]
+  ## Split each cryptarithm into its words on spaces, '+' and '='.
+  operands <- strsplit(cryptarithms, "[ +=]+")
+
+  ## Describe one cryptarithm by the lengths of its words.
+  ## x: character vector holding the words of a single cryptarithm.
+  ## Returns a named numeric vector: number of words, total number of letters,
+  ## number of distinct symbols, mean, median, minimum and maximum word
+  ## length, and the difference between the maximum and the minimum length.
+  ExtractFeatures <- function(x) {
+    y <- nchar(x)
+    r <- range(y)
+    z <- length(unique(unlist(strsplit(x, ""))))
+    c(words = length(x), letters = sum(y), symbols = z, meanLen = mean(y), medianLen = median(y), minLen = r[1], maxLen = r[2], diffLen = r[2] - r[1])
+  }
+
+  ## vapply fixes the shape of the result (8 features per cryptarithm), and the
+  ## named template keeps the column names even when there is no cryptarithm.
+  feature_names <- c("words", "letters", "symbols", "meanLen", "medianLen", "minLen", "maxLen", "diffLen")
+  features <- as.data.frame(t(vapply(operands, ExtractFeatures, setNames(numeric(8), feature_names))))
+  cat(nrow(features), "cryptarithms in database\n")
+  summary(features)
+
+  library(dplyr)
+  library(ggplot2)
+  ## Heat map of the log10 number of cryptarithms for each pair
+  ## (number of words, length of the longest word).
+  x <- features %>% count(words, maxLen)
+  x$n <- log10(x$n)
+  ggplot(x, aes(words, maxLen, fill = n)) +
+  geom_tile()
+
+#+END_SRC
@@ -4,3 +4,5 @@
 
 - Thousands of cryptarithms generated by our solver.
 - Each file contains a theme collection.
+- The directory ~samples~ contains cryptarithms generated using word lists sampled from the French dictionary. The filename indicates the number of summands prefixed by ~N~, the length of the shortest word, and the length of the longest word.
+- The file ~external-contributions.db.txt~ contains cryptarithms found on the internet.