Skip to content

Commit 00cdada

Browse files
committed
UBXFConverter: ignore footnote content
When running `createsrcloc` or `analyze`, ignore `\w` tags inside footnotes. UHB uses `\w` tags there to denote alternative readings, so they must not be counted as source locations. While we are at it, make sure that AugmentGrammar's `dump` and `dumpwords` options write their output as UTF-8 instead of the platform default encoding.
1 parent 708fc2f commit 00cdada

File tree

2 files changed

+19
-14
lines changed

2 files changed

+19
-14
lines changed

biblemulticonverter/src/main/java/biblemulticonverter/format/AugmentGrammar.java

+4-2
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
import java.io.FileOutputStream;
77
import java.io.FileWriter;
88
import java.io.IOException;
9+
import java.io.OutputStreamWriter;
10+
import java.nio.charset.StandardCharsets;
911
import java.util.ArrayList;
1012
import java.util.Arrays;
1113
import java.util.Comparator;
@@ -65,7 +67,7 @@ public void doExport(Bible bible, String... exportArgs) throws Exception {
6567
}
6668
if (exportArgs[0].equals("dump")) {
6769
boolean humanStrongs = exportArgs.length > 2 && exportArgs[2].equals("humanStrongs");
68-
try (BufferedWriter bw = new BufferedWriter(new FileWriter(exportArgs[1]))) {
70+
try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(exportArgs[1]), StandardCharsets.UTF_8))) {
6971
runOperation(bible, new GrammarOperation() {
7072

7173
private int counter = 0;
@@ -95,7 +97,7 @@ public Visitor<RuntimeException> handleGrammar(Versification.Reference reference
9597
}
9698
} else if (exportArgs[0].equals("dumpwords")) {
9799
boolean humanStrongs = exportArgs.length > 2 && exportArgs[2].equals("humanStrongs");
98-
try (BufferedWriter bw = new BufferedWriter(new FileWriter(exportArgs[1]))) {
100+
try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(exportArgs[1]), StandardCharsets.UTF_8))) {
99101
runOperation(bible, new GrammarOperation() {
100102

101103
private int counter = 0;

biblemulticonverter/src/main/java/biblemulticonverter/format/paratext/UBXFConverter.java

+15-12
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import biblemulticonverter.format.paratext.ParatextBook.ParatextID;
2121
import biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormatting;
2222
import biblemulticonverter.format.paratext.ParatextCharacterContent.AutoClosingFormattingKind;
23+
import biblemulticonverter.format.paratext.ParatextCharacterContent.FootnoteXref;
2324
import biblemulticonverter.format.paratext.ParatextCharacterContent.Milestone;
2425
import biblemulticonverter.format.paratext.ParatextCharacterContent.ParatextCharacterContentPart;
2526
import biblemulticonverter.format.paratext.ParatextCharacterContent.Reference;
@@ -74,7 +75,7 @@ public void doExportBooks(List<ParatextBook> books, String... exportArgs) throws
7475
if (exportArgs[0].equals("createsrcloc") && exportArgs[2].equals("--")) {
7576
final String prefix = exportArgs[1];
7677
for (ParatextBook book : books) {
77-
book.accept(new UBXFBookVisitor(book.getId(), new UBXFGrammarHandlerVisitor() {
78+
book.accept(new UBXFBookVisitor(book.getId(), false, new UBXFGrammarHandlerVisitor() {
7879
int index;
7980

8081
@Override
@@ -96,7 +97,7 @@ protected void handleWordlist(AutoClosingFormatting acf, Reference where) {
9697
Map<String, Integer> occurrences = new HashMap<>();
9798
Map<String, String> words = new HashMap<>();
9899
for (ParatextBook book : books) {
99-
book.accept(new UBXFBookVisitor(book.getId(), new UBXFGrammarHandlerVisitor() {
100+
book.accept(new UBXFBookVisitor(book.getId(), false, new UBXFGrammarHandlerVisitor() {
100101
private void extractContent(StringBuilder sb, ParatextCharacterContentContainer pccc) {
101102
for (ParatextCharacterContentPart part : pccc.getContent()) {
102103
if (part instanceof ParatextCharacterContentContainer) {
@@ -156,7 +157,7 @@ protected void handleWordlist(AutoClosingFormatting acf, Reference where) {
156157
props.load(fis);
157158
}
158159
for (ParatextBook book : books) {
159-
book.accept(new UBXFBookVisitor(book.getId(), new UBXFGrammarHandlerVisitor() {
160+
book.accept(new UBXFBookVisitor(book.getId(), true, new UBXFGrammarHandlerVisitor() {
160161
@Override
161162
protected void handleAlignMilestone(boolean start, Milestone milestone, Reference where) {
162163
if (!start)
@@ -187,7 +188,7 @@ protected void handleWordlist(AutoClosingFormatting acf, Reference where) {
187188
} else if ((exportArgs[0].equals("fillwordattr") || exportArgs[0].equals("createwordattr")) && exportArgs[1].equals("--")) {
188189
final boolean restructure = exportArgs[0].equals("createwordattr");
189190
for (ParatextBook book : books) {
190-
book.accept(new UBXFBookVisitor(book.getId(), new UBXFGrammarHandlerVisitor() {
191+
book.accept(new UBXFBookVisitor(book.getId(), true, new UBXFGrammarHandlerVisitor() {
191192
private void restructure(ParatextCharacterContentContainer pccc, boolean inWordlist, boolean inNewWordlist) {
192193
for (int i = 0; i < pccc.getContent().size(); i++) {
193194
ParatextCharacterContentPart part = pccc.getContent().get(i);
@@ -243,11 +244,11 @@ private void restructure(ParatextCharacterContentContainer pccc, boolean inWordl
243244
}
244245

245246
@Override
246-
protected void handleContent(ParatextCharacterContentContainer pccc) {
247+
protected void handleContent(ParatextCharacterContentContainer pccc, boolean enterFootnotes) {
247248
if (restructure) {
248249
restructure(pccc, false, false);
249250
}
250-
super.handleContent(pccc);
251+
super.handleContent(pccc, enterFootnotes);
251252
}
252253

253254
List<Milestone> openMilestones = new ArrayList<>();
@@ -289,7 +290,7 @@ protected void handleWordlist(AutoClosingFormatting acf, Reference where) {
289290
formatArg = 2;
290291
} else if (exportArgs[0].equals("convertgrammar") && exportArgs[1].equals("--")) {
291292
for (ParatextBook book : books) {
292-
book.accept(new UBXFBookVisitor(book.getId(), new UBXFGrammarHandlerVisitor() {
293+
book.accept(new UBXFBookVisitor(book.getId(), true, new UBXFGrammarHandlerVisitor() {
293294

294295
private String getHebPrefixStrong(char ch) {
295296
switch (ch) {
@@ -435,10 +436,12 @@ private static class UBXFBookVisitor implements ParatextBookContentVisitor<Runti
435436

436437
private final ParatextID bookID;
437438
private final UBXFGrammarHandlerVisitor ghv;
439+
private final boolean enterFootnotes;
438440
private int chapterNumber = -1;
439441

440-
public UBXFBookVisitor(ParatextID bookID, UBXFGrammarHandlerVisitor ghv) {
442+
public UBXFBookVisitor(ParatextID bookID, boolean enterFootnotes, UBXFGrammarHandlerVisitor ghv) {
441443
this.bookID = bookID;
444+
this.enterFootnotes = enterFootnotes;
442445
this.ghv = ghv;
443446
}
444447

@@ -490,7 +493,7 @@ public void visitFigure(String caption, Map<String, String> attributes) throws R
490493

491494
@Override
492495
public void visitParatextCharacterContent(ParatextCharacterContent content) throws RuntimeException {
493-
ghv.handleContent(content);
496+
ghv.handleContent(content, enterFootnotes);
494497
}
495498
}
496499

@@ -507,7 +510,7 @@ public void setWhere(Reference newWhere) {
507510
protected void handleAlignMilestone(boolean start, Milestone milestone, Reference where) {
508511
}
509512

510-
protected void handleContent(ParatextCharacterContentContainer pccc) {
513+
protected void handleContent(ParatextCharacterContentContainer pccc, boolean enterFootnotes) {
511514
for (ParatextCharacterContentPart part : pccc.getContent()) {
512515
if (part instanceof AutoClosingFormatting && ((AutoClosingFormatting) part).getKind() == AutoClosingFormattingKind.WORDLIST) {
513516
handleWordlist((AutoClosingFormatting) part, where);
@@ -519,8 +522,8 @@ protected void handleContent(ParatextCharacterContentContainer pccc) {
519522
handleAlignMilestone(false, milestone, where);
520523
}
521524
}
522-
if (part instanceof ParatextCharacterContentContainer) {
523-
handleContent((ParatextCharacterContentContainer) part);
525+
if (part instanceof ParatextCharacterContentContainer && (enterFootnotes || !(part instanceof FootnoteXref))) {
526+
handleContent((ParatextCharacterContentContainer) part, enterFootnotes);
524527
}
525528
}
526529
}

0 commit comments

Comments
 (0)