Skip to content

Commit 277db2e

Browse files
committed
dep: change java html dep to neko-htmlunit
and update the implementation and the tests
1 parent a00e0d4 commit 277db2e

18 files changed

+49
-36
lines changed

ext/java/nokogiri/Html4ElementDescription.java

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
import java.util.List;
77
import java.util.Map;
88

9-
import org.cyberneko.html.HTMLElements;
9+
import net.sourceforge.htmlunit.cyberneko.HTMLElements;
1010
import org.jruby.Ruby;
1111
import org.jruby.RubyClass;
1212
import org.jruby.RubyObject;
@@ -24,6 +24,7 @@
2424
public class Html4ElementDescription extends RubyObject
2525
{
2626
private static final long serialVersionUID = 1L;
27+
private static final HTMLElements htmlElements_ = new HTMLElements();
2728

2829
/**
2930
* Stores memoized hash of element -> list of valid subelements.
@@ -63,9 +64,8 @@ public class Html4ElementDescription extends RubyObject
6364
* the list of elements directly because it's protected.
6465
*/
6566
for (short c = 0; c < HTMLElements.UNKNOWN; c++) {
66-
HTMLElements.Element maybe_sub =
67-
HTMLElements.getElement(c);
68-
if (maybe_sub.isParent(elem)) {
67+
HTMLElements.Element maybe_sub = htmlElements_.getElement(c);
68+
if (maybe_sub != null && maybe_sub.isParent(elem)) {
6969
subs.add(maybe_sub.name);
7070
}
7171
}
@@ -82,11 +82,10 @@ public class Html4ElementDescription extends RubyObject
8282
IRubyObject klazz, IRubyObject name)
8383
{
8484

85-
// nekohtml will return an element even for invalid names, see
86-
// http://sourceforge.net/p/nekohtml/code/HEAD/tree/trunk/src/org/cyberneko/html/HTMLElements.java#l514
87-
// which breaks `test_fetch_nonexistent'
88-
HTMLElements.Element elem = HTMLElements.getElement(name.asJavaString(), HTMLElements.NO_SUCH_ELEMENT);
89-
if (elem == HTMLElements.NO_SUCH_ELEMENT) {
85+
// nekohtml will return an element even for invalid names, which breaks `test_fetch_nonexistent'
86+
// see getElement() in HTMLElements.java
87+
HTMLElements.Element elem = htmlElements_.getElement(name.asJavaString(), htmlElements_.NO_SUCH_ELEMENT);
88+
if (elem == htmlElements_.NO_SUCH_ELEMENT) {
9089
return context.nil;
9190
}
9291

ext/java/nokogiri/Html4EntityLookup.java

Lines changed: 13 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import static org.jruby.runtime.Helpers.invoke;
44

5-
import org.cyberneko.html.HTMLEntities;
5+
import net.sourceforge.htmlunit.cyberneko.HTMLEntitiesParser;
66
import org.jruby.Ruby;
77
import org.jruby.RubyClass;
88
import org.jruby.RubyObject;
@@ -38,8 +38,18 @@ public class Html4EntityLookup extends RubyObject
3838
{
3939
Ruby ruby = context.getRuntime();
4040
String name = key.toString();
41-
int val = HTMLEntities.get(name);
42-
if (val == -1) { return ruby.getNil(); }
41+
42+
HTMLEntitiesParser parser = new HTMLEntitiesParser();
43+
for (int j = 0 ; j < name.length() ; j++) {
44+
if (!parser.parse(name.charAt(j))) {
45+
break;
46+
}
47+
}
48+
String match = parser.getMatch();
49+
50+
if (match == null) { return ruby.getNil(); }
51+
52+
int val = match.charAt(0);
4353

4454
IRubyObject edClass =
4555
ruby.getClassFromPath("Nokogiri::HTML4::EntityDescription");

ext/java/nokogiri/Html4SaxParserContext.java

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,7 @@
99
import java.util.regex.Pattern;
1010

1111
import org.apache.xerces.parsers.AbstractSAXParser;
12-
import org.cyberneko.html.parsers.SAXParser;
12+
import net.sourceforge.htmlunit.cyberneko.parsers.SAXParser;
1313
import org.jruby.Ruby;
1414
import org.jruby.RubyClass;
1515
import org.jruby.RubyFixnum;

ext/java/nokogiri/NokogiriService.java

Lines changed: 0 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -86,7 +86,6 @@ public class NokogiriService implements BasicLibraryService
8686
RubyModule htmlSaxModule = htmlModule.defineModuleUnder("SAX");
8787
RubyModule xsltModule = nokogiri.defineModuleUnder("XSLT");
8888

89-
createJavaLibraryVersionConstants(ruby, nokogiri);
9089
createNokogiriModule(ruby, nokogiri);
9190
createSyntaxErrors(ruby, nokogiri, xmlModule);
9291
RubyClass xmlNode = createXmlModule(ruby, xmlModule);
@@ -97,12 +96,6 @@ public class NokogiriService implements BasicLibraryService
9796
nokogiri.setInternalVariable("cache", populateNokogiriClassCahce(ruby));
9897
}
9998

100-
private void
101-
createJavaLibraryVersionConstants(Ruby ruby, RubyModule nokogiri)
102-
{
103-
nokogiri.defineConstant("NEKO_VERSION", ruby.newString(org.cyberneko.html.Version.getVersion()));
104-
}
105-
10699
private void
107100
createNokogiriModule(Ruby ruby, RubyModule nokogiri)
108101
{

ext/java/nokogiri/internals/HtmlDomParserContext.java

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -15,8 +15,8 @@
1515
import org.apache.xerces.xni.XNIException;
1616
import org.apache.xerces.xni.parser.XMLDocumentFilter;
1717
import org.apache.xerces.xni.parser.XMLParserConfiguration;
18-
import org.cyberneko.html.HTMLConfiguration;
19-
import org.cyberneko.html.filters.DefaultFilter;
18+
import net.sourceforge.htmlunit.cyberneko.HTMLConfiguration;
19+
import net.sourceforge.htmlunit.cyberneko.filters.DefaultFilter;
2020
import org.jruby.Ruby;
2121
import org.jruby.RubyClass;
2222
import org.jruby.runtime.ThreadContext;

ext/java/nokogiri/internals/SaveContextVisitor.java

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@
1212
import java.util.regex.Matcher;
1313
import java.util.regex.Pattern;
1414

15-
import org.cyberneko.html.HTMLElements;
15+
import net.sourceforge.htmlunit.cyberneko.HTMLElements;
1616
import org.w3c.dom.Attr;
1717
import org.w3c.dom.CDATASection;
1818
import org.w3c.dom.Comment;
@@ -81,6 +81,8 @@ public class SaveContextVisitor
8181
public static final int SUBSETS = 8;
8282
public static final int EXCLUSIVE = 16;
8383

84+
private static final HTMLElements htmlElements_ = new HTMLElements();
85+
8486
public
8587
SaveContextVisitor(int options, CharSequence indent, String encoding, boolean htmlDoc, boolean fragment,
8688
int canonicalOpts)
@@ -498,7 +500,7 @@ public class SaveContextVisitor
498500
private boolean
499501
isEmpty(String name)
500502
{
501-
HTMLElements.Element element = HTMLElements.getElement(name);
503+
HTMLElements.Element element = htmlElements_.getElement(name);
502504
return element.isEmpty();
503505
}
504506

lib/nokogiri/jruby/dependencies.rb

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -9,7 +9,6 @@
99
# should skip loading xml jars. This is because those are in WEB-INF/lib and
1010
# already set in the classpath.
1111
unless $LOAD_PATH.to_s.include?("appengine-rack")
12-
require "nekohtml.jar"
1312
require "nekodtd.jar"
1413
end
1514

lib/nokogiri/jruby/nekohtml.jar

-124 KB
Binary file not shown.

lib/nokogiri/jruby/nokogiri_jars.rb

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,6 +8,7 @@
88
require 'net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar'
99
require 'xml-apis/xml-apis/1.4.01/xml-apis-1.4.01.jar'
1010
require 'xalan/serializer/2.7.2/serializer-2.7.2.jar'
11+
require 'net/sourceforge/htmlunit/neko-htmlunit/2.61.0/neko-htmlunit-2.61.0.jar'
1112
require 'isorelax/isorelax/20030108/isorelax-20030108.jar'
1213
end
1314

@@ -18,11 +19,13 @@
1819
require_jar 'net.sf.saxon', 'Saxon-HE', '9.6.0-4'
1920
require_jar 'xml-apis', 'xml-apis', '1.4.01'
2021
require_jar 'xalan', 'serializer', '2.7.2'
22+
require_jar 'net.sourceforge.htmlunit', 'neko-htmlunit', '2.61.0'
2123
require_jar 'isorelax', 'isorelax', '20030108'
2224
end
2325

2426
# generated by the :vendor_jars rake task
2527
module Nokogiri
26-
JAR_DEPENDENCIES = {"isorelax"=>"isorelax:isorelax:20030108", "jing"=>"nu.validator:jing:20200702VNU", "serializer"=>"xalan:serializer:2.7.2", "xalan"=>"xalan:xalan:2.7.2", "xercesImpl"=>"xerces:xercesImpl:2.12.2", "xml-apis"=>"xml-apis:xml-apis:1.4.01"}.freeze
28+
JAR_DEPENDENCIES = {"isorelax"=>"isorelax:isorelax:20030108", "neko-htmlunit"=>"net.sourceforge.htmlunit:neko-htmlunit:2.61.0", "jing"=>"nu.validator:jing:20200702VNU", "serializer"=>"xalan:serializer:2.7.2", "xalan"=>"xalan:xalan:2.7.2", "xercesImpl"=>"xerces:xercesImpl:2.12.2", "xml-apis"=>"xml-apis:xml-apis:1.4.01"}.freeze
2729
XERCES_VERSION = JAR_DEPENDENCIES["xercesImpl"]
30+
NEKO_VERSION = JAR_DEPENDENCIES["neko-htmlunit"]
2831
end

lib/nokogiri/version/info.rb

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -169,7 +169,6 @@ def to_hash
169169
vi["other_libraries"] = Hash[*Nokogiri::OTHER_LIBRARY_VERSIONS.split(/[,:]/)]
170170
elsif jruby?
171171
vi["other_libraries"] = {}.tap do |ol|
172-
ol["nekohtml"] = Nokogiri::NEKO_VERSION
173172
Nokogiri::JAR_DEPENDENCIES.each do |k, v|
174173
ol[k] = v
175174
end

nokogiri.gemspec

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -260,8 +260,8 @@ Gem::Specification.new do |spec|
260260
"lib/nokogiri/jruby/dependencies.rb",
261261
"lib/nokogiri/jruby/isorelax/isorelax/20030108/isorelax-20030108.jar",
262262
"lib/nokogiri/jruby/nekodtd.jar",
263-
"lib/nokogiri/jruby/nekohtml.jar",
264263
"lib/nokogiri/jruby/net/sf/saxon/Saxon-HE/9.6.0-4/Saxon-HE-9.6.0-4.jar",
264+
"lib/nokogiri/jruby/net/sourceforge/htmlunit/neko-htmlunit/2.61.0/neko-htmlunit-2.61.0.jar",
265265
"lib/nokogiri/jruby/nokogiri_jars.rb",
266266
"lib/nokogiri/jruby/nu/validator/jing/20200702VNU/jing-20200702VNU.jar",
267267
"lib/nokogiri/jruby/xalan/serializer/2.7.2/serializer-2.7.2.jar",
@@ -326,6 +326,7 @@ Gem::Specification.new do |spec|
326326
spec.requirements << "jar isorelax, isorelax, 20030108" # https://search.maven.org/artifact/isorelax/isorelax
327327
# spec.requirements << "jar nekohtml, nekodtd, 0.1.11" # FIXME, we should use Shahid's fork
328328
# spec.requirements << "jar nekohtml, nekohtml, 1.9.6.2" # FIXME, we should use Shahid's fork
329+
spec.requirements << "jar net.sourceforge.htmlunit, neko-htmlunit, 2.61.0"
329330
spec.requirements << "jar nu.validator, jing, 20200702VNU" # https://search.maven.org/artifact/nu.validator/jing
330331
spec.requirements << "jar xalan, serializer, 2.7.2" # https://search.maven.org/artifact/xalan/serializer
331332
spec.requirements << "jar xalan, xalan, 2.7.2" # https://search.maven.org/artifact/xalan/xalan

rakelib/extensions.rake

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -396,6 +396,7 @@ if java?
396396
module Nokogiri
397397
JAR_DEPENDENCIES = #{jar_dependencies}.freeze
398398
XERCES_VERSION = JAR_DEPENDENCIES["xercesImpl"]
399+
NEKO_VERSION = JAR_DEPENDENCIES["neko-htmlunit"]
399400
end
400401
EOF
401402
end

scripts/test-gem-file-contents

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -271,12 +271,12 @@ describe File.basename(gemfile) do
271271
it "contains the java jar files" do
272272
assert_includes(gemfile_contents, "lib/nokogiri/nokogiri.jar")
273273
assert_includes(gemfile_contents, "lib/nokogiri/jruby/nekodtd.jar")
274-
assert_includes(gemfile_contents, "lib/nokogiri/jruby/nekohtml.jar")
275274

276275
actual_jars = gemfile_contents.grep(/.*\.jar$/)
277276
expected_jars = [
278277
"isorelax",
279278
"jing",
279+
"neko-htmlunit",
280280
"serializer",
281281
"xalan",
282282
"xercesImpl",

test/html4/test_comments.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -75,7 +75,7 @@ class TestComment < Nokogiri::TestCase
7575
it "behaves as if the comment is closed correctly" do # COMPLIANT
7676
assert_equal 1, subject.children.length
7777
assert_predicate subject.children.first, :comment?
78-
assert_equal "-", subject.children.first.content # curious, potentially non-compliant?
78+
assert_equal "", subject.children.first.content
7979
assert other_div
8080
end
8181
end

test/html4/test_element_description.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -63,7 +63,7 @@ def test_subelements
6363
elsif Nokogiri.uses_libxml?
6464
assert_equal(61, sub_elements.length)
6565
else
66-
refute_empty(sub_elements)
66+
assert_equal(105, sub_elements.length)
6767
end
6868
end
6969

test/test_version.rb

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ def test_version_info_basics
3232
def test_version_info_for_xerces_and_nekohtml
3333
skip_unless_jruby("xerces/nekohtml is only used for JRuby")
3434
assert_equal(Nokogiri::XERCES_VERSION, version_info["other_libraries"]["xercesImpl"])
35-
assert_equal(Nokogiri::NEKO_VERSION, version_info["other_libraries"]["nekohtml"])
35+
assert_equal(Nokogiri::NEKO_VERSION, version_info["other_libraries"]["neko-htmlunit"])
3636
refute_nil(version_info["other_libraries"]["isorelax"])
3737
refute_nil(version_info["other_libraries"]["jing"])
3838
refute_nil(version_info["other_libraries"]["serializer"])

test/xml/test_node.rb

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -95,7 +95,9 @@ def test_node_context_parsing_of_malformed_html_fragment
9595
context_node = doc.at_css("div")
9696
nodeset = context_node.parse("<div </div>")
9797

98-
assert_equal(1, doc.errors.length)
98+
pending_if("nekohtml commit 21286e4 not applied to neko-htmlunit", Nokogiri::VERSION_INFO["other_libraries"]["neko-htmlunit"]) do
99+
assert_equal(1, doc.errors.length)
100+
end
99101
assert_equal(1, nodeset.length)
100102
assert_equal("<div></div>", nodeset.to_s)
101103
assert_instance_of(Nokogiri::HTML4::Document, nodeset.document)
@@ -107,7 +109,9 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct
107109
context_node = doc.at_css("div")
108110
nodeset = context_node.parse("<div </div>", &:recover)
109111

110-
assert_equal(1, doc.errors.length)
112+
pending_if("nekohtml commit 21286e4 not applied to neko-htmlunit", Nokogiri::VERSION_INFO["other_libraries"]["neko-htmlunit"]) do
113+
assert_equal(1, doc.errors.length)
114+
end
111115
assert_equal(1, nodeset.length)
112116
assert_equal("<div></div>", nodeset.to_s)
113117
assert_instance_of(Nokogiri::HTML4::Document, nodeset.document)
@@ -117,8 +121,10 @@ def test_node_context_parsing_of_malformed_html_fragment_with_recover_is_correct
117121
def test_node_context_parsing_of_malformed_html_fragment_without_recover_is_not_corrected
118122
doc = HTML4.parse("<html><body><div></div></body></html>")
119123
context_node = doc.at_css("div")
120-
assert_raises(Nokogiri::XML::SyntaxError) do
121-
context_node.parse("<div </div>", &:strict)
124+
pending_if("nekohtml commit 21286e4 not applied to neko-htmlunit", Nokogiri::VERSION_INFO["other_libraries"]["neko-htmlunit"]) do
125+
assert_raises(Nokogiri::XML::SyntaxError) do
126+
context_node.parse("<div </div>", &:strict)
127+
end
122128
end
123129
end
124130

0 commit comments

Comments
 (0)