Skip to content
This repository was archived by the owner on Apr 24, 2020. It is now read-only.

Commit b0164ef

Browse files
committed
bug fix and new executable added for testing
Fixed a hash bug where the output contained '( #)' instead of '(# #)'. Added pfpc_token, which takes tokens instead of a sentence as input. Added a test for this fix.
1 parent fd93108 commit b0164ef

File tree

4 files changed

+211
-2
lines changed

4 files changed

+211
-2
lines changed

CMakeLists.txt

+8
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,18 @@ ADD_EXECUTABLE(test
2525
src/test/lexicon.cpp
2626
src/test/pcfg_parser.cpp
2727
src/test/tokenizer.cpp
28+
src/test/pfp.cpp
2829
src/test/main.cpp
2930
)
3031

3132
ADD_EXECUTABLE(pfpc
3233
src/pfpc/main.cpp
3334
)
3435

36+
ADD_EXECUTABLE(pfpc_token
37+
src/pfpc/pfpc_token.cpp
38+
)
39+
3540
ADD_LIBRARY(pfp SHARED
3641
src/pfp/config
3742
src/pfp/tokenizer.yy
@@ -48,16 +53,19 @@ ADD_EXECUTABLE(pfpd
4853
IF(APPLE)
4954
TARGET_LINK_LIBRARIES(pfpd pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
5055
TARGET_LINK_LIBRARIES(pfpc pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
56+
TARGET_LINK_LIBRARIES(pfpc_token pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
5157
TARGET_LINK_LIBRARIES(test pfp boost_thread-mt boost_unit_test_framework-mt boost_regex-mt icuio)
5258
TARGET_LINK_LIBRARIES(pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio icuuc)
5359
ELSE(APPLE)
5460
TARGET_LINK_LIBRARIES(pfpd pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
5561
TARGET_LINK_LIBRARIES(pfpc pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
62+
TARGET_LINK_LIBRARIES(pfpc_token pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
5663
TARGET_LINK_LIBRARIES(test pfp boost_thread boost_unit_test_framework boost_regex icuio icuuc)
5764
ENDIF(APPLE)
5865

5966
INSTALL(TARGETS pfpd DESTINATION bin)
6067
INSTALL(TARGETS pfpc DESTINATION bin)
68+
INSTALL(TARGETS pfpc_token DESTINATION bin)
6169

6270
INSTALL(TARGETS pfp LIBRARY DESTINATION lib)
6371
INSTALL(DIRECTORY share/pfp DESTINATION share)

include/pfp/util.hpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -147,9 +147,17 @@ InputIterator stitch(Out & out, const node & tree, InputIterator word_it, StateL
147147
// don't output boundary
148148
if (tree.state == consts::boundary_state)
149149
return word_it;
150-
out << '(' << states[tree.state].basic_category() << ' ';
151-
if ( tree.children.empty() )
150+
151+
out << '(';
152+
if (states[tree.state].basic_category() == "" && tree.children.empty())
153+
out << *word_it;
154+
else
155+
out << states[tree.state].basic_category();
156+
out << ' ';
157+
158+
if ( tree.children.empty() ) {
152159
out << *word_it++;
160+
}
153161
else
154162
{
155163
for (std::vector< boost::shared_ptr< node > >::const_iterator it = tree.children.begin(); it != tree.children.end(); ++it)

src/pfpc/pfpc_token.cpp

+98
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
#include <iostream>
2+
#include <fstream>
3+
#include <vector>
4+
#include <stdexcept>
5+
6+
#include <boost/lexical_cast.hpp>
7+
#include <boost/algorithm/string.hpp>
8+
#include <boost/filesystem/operations.hpp>
9+
10+
#include <pfp/config.h>
11+
#include <pfp/tokenizer.h>
12+
#include <pfp/state_list.hpp>
13+
#include <pfp/lexicon.hpp>
14+
#include <pfp/unary_grammar.hpp>
15+
#include <pfp/binary_grammar.hpp>
16+
#include <pfp/binary_grammar.hpp>
17+
#include <pfp/pcfg_parser.hpp>
18+
19+
using namespace com::wavii::pfp;
20+
using namespace boost;
21+
namespace fs = boost::filesystem;
22+
23+
template<class T>
24+
void load(T & obj, boost::filesystem::path p)
25+
{
26+
if (!fs::exists(p))
27+
throw std::runtime_error("can't find " + p.string());
28+
std::ifstream in(p.string().c_str());
29+
obj.load(in);
30+
}
31+
32+
int main(int argc, char * argv[])
33+
{
34+
std::clog << "pfpc: command line interface for pfp!" << std::endl;
35+
std::clog << "build: " << __DATE__ << " (" << __TIME__ << ") of pfp version " << consts::version << " (c) Wavii,Inc. 2010" << std::endl;
36+
std::clog << "usage: " << argv[0] << " <max sentence length=45> <data dir=/usr/share/pfp/>" << std::endl;
37+
38+
size_t sentence_length = argc < 2 ? 45 : lexical_cast<size_t>(argv[1]);
39+
std::string data_dir = argc < 3 ? "/usr/share/pfp/" : argv[2]; // make install copies files to /usr/share/pfp by default
40+
41+
tokenizer tokenizer;
42+
state_list states;
43+
lexicon lexicon(states);
44+
unary_grammar ug(states);
45+
binary_grammar bg(states);
46+
pcfg_parser pcfg(states, ug, bg);
47+
48+
std::clog << "loading lexicon and grammar" << std::endl;
49+
load(tokenizer, fs::path(data_dir) / "americanizations");
50+
load(states, fs::path(data_dir) / "states");
51+
{
52+
fs::path ps[] = { fs::path(data_dir) / "words", fs::path(data_dir) / "sigs", fs::path(data_dir) / "word_state", fs::path(data_dir) / "sig_state" };
53+
std::ifstream ins[4];
54+
for (int i = 0; i != 4; ++i)
55+
{
56+
if (!fs::exists(ps[i]))
57+
throw std::runtime_error("can't find " + ps[i].string());
58+
ins[i].open(ps[i].string().c_str());
59+
}
60+
lexicon.load(ins[0], ins[1], ins[2], ins[3]);
61+
}
62+
load(ug, fs::path(data_dir) / "unary_rules");
63+
load(bg, fs::path(data_dir) / "binary_rules");
64+
workspace w(sentence_length, states.size());
65+
66+
std::vector< std::string > words;
67+
std::clog << "ready! enter lines to parse:" << std::endl;
68+
for (std::string word; std::getline(std::cin, word); ) {
69+
boost::trim(word);
70+
if (word == "")
71+
break;
72+
words.push_back(word);
73+
}
74+
75+
std::vector< std::pair< state_t, float > > state_weight;
76+
std::vector< std::vector< state_score_t > > sentence_f;
77+
node result;
78+
// tokenizer.tokenize(sentence, words);
79+
80+
for (std::vector< std::string >::const_iterator it = words.begin(); it != words.end(); ++it)
81+
{
82+
state_weight.clear(); lexicon.score(*it, std::back_inserter(state_weight));
83+
sentence_f.push_back(std::vector< state_score_t >(state_weight.size()));
84+
// scale by score_resolution in case we are downcasting our weights
85+
for (size_t i = 0; i != state_weight.size(); ++i)
86+
sentence_f.back()[i] = state_score_t(state_weight[i].first, state_weight[i].second * consts::score_resolution);
87+
}
88+
// add the boundary symbol
89+
sentence_f.push_back( std::vector< state_score_t >(1, state_score_t(consts::boundary_state, 0.0f)));
90+
// and parse!
91+
if (!pcfg.parse(sentence_f, w, result))
92+
std::cout << "ERRROR!" << std::endl;
93+
// stitch together the results
94+
std::ostringstream oss;
95+
std::vector< std::string >::iterator word_it = words.begin();
96+
stitch(oss, result, word_it, states);
97+
std::cout << oss.str() << std::endl;
98+
}

src/test/pfp.cpp

+95
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
2+
#include <boost/test/unit_test.hpp>
3+
#include <boost/test/test_tools.hpp>
4+
5+
#include <iostream>
6+
#include <fstream>
7+
#include <vector>
8+
#include <stdexcept>
9+
10+
#include <boost/lexical_cast.hpp>
11+
#include <boost/filesystem/operations.hpp>
12+
13+
#include <pfp/config.h>
14+
#include <pfp/tokenizer.h>
15+
#include <pfp/state_list.hpp>
16+
#include <pfp/lexicon.hpp>
17+
#include <pfp/unary_grammar.hpp>
18+
#include <pfp/binary_grammar.hpp>
19+
#include <pfp/binary_grammar.hpp>
20+
#include <pfp/pcfg_parser.hpp>
21+
22+
using namespace com::wavii::pfp;
23+
using namespace boost;
24+
namespace fs = boost::filesystem;
25+
26+
template<class T>
27+
void load(T & obj, boost::filesystem::path p)
28+
{
29+
if (!fs::exists(p))
30+
throw std::runtime_error("can't find " + p.string());
31+
std::ifstream in(p.string().c_str());
32+
obj.load(in);
33+
}
34+
35+
BOOST_AUTO_TEST_SUITE( pfp_parser_test )
36+
37+
// Regression test for the hash bug: parses a sentence containing a '#' token
// and checks that the stitched output renders it as "(# #)".
BOOST_AUTO_TEST_CASE( test_pfp_hash )
{
  tokenizer tok;
  state_list states;
  lexicon lex(states);
  unary_grammar ug(states);
  binary_grammar bg(states);
  pcfg_parser pcfg(states, ug, bg);

  // model files are looked up relative to the current working directory
  const std::string data_dir = "./share/pfp";
  const fs::path data_root(data_dir);

  load(tok, data_root / "americanizations");
  load(states, data_root / "states");
  {
    // the lexicon is read from four separate files, in this order
    static const char * const lexicon_files[4] = { "words", "sigs", "word_state", "sig_state" };
    std::ifstream streams[4];
    for (int k = 0; k < 4; ++k)
    {
      const fs::path file = data_root / lexicon_files[k];
      if (!fs::exists(file))
      {
        throw std::runtime_error("can't find " + file.string());
      }
      streams[k].open(file.string().c_str());
    }
    lex.load(streams[0], streams[1], streams[2], streams[3]);
  }
  load(ug, data_root / "unary_rules");
  load(bg, data_root / "binary_rules");
  workspace w(45, states.size());

  const std::string sentence= "Description This 2005 Nissan Altima available from Rama Auto Inc with Stock # 330051 is priced at $ 9500.00 .";
  const std::string parsed = "(ROOT (S (S (VP (VBG Description) (NP (DT This) (CD 2005) (NNP Nissan) (NNP Altima)) (ADJP (JJ available) (PP (IN from) (NP (NP (NNP Rama) (NNP Auto) (NNP Inc)) (PP (IN with) (NP (NNP Stock)))))))) (NP (# #) (CD 330051)) (VP (VBZ is) (VP-VBN-v (VBN priced) (PP (IN at) (NP ($ $) (CD 9500.00))))) (. .)) )";

  // split the sentence into tokens
  std::vector< std::string > tokens;
  tok.tokenize(sentence, tokens);

  // one row of candidate (state, score) entries per token
  std::vector< std::vector< state_score_t > > chart_input;
  std::vector< std::pair< state_t, float > > weights;
  for (size_t t = 0; t < tokens.size(); ++t)
  {
    weights.clear();
    lex.score(tokens[t], std::back_inserter(weights));

    std::vector< state_score_t > row;
    for (std::vector< std::pair< state_t, float > >::const_iterator sw = weights.begin(); sw != weights.end(); ++sw)
    {
      // scale by score_resolution in case we are downcasting our weights
      row.push_back(state_score_t(sw->first, sw->second * consts::score_resolution));
    }
    chart_input.push_back(row);
  }

  // the sentence is terminated by the boundary symbol
  const state_score_t boundary(consts::boundary_state, 0.0f);
  chart_input.push_back(std::vector< state_score_t >(1, boundary));

  node tree;
  if (!pcfg.parse(chart_input, w, tree))
  {
    BOOST_FAIL("Parsing shouldn't fail!");
  }

  // render the tree back into bracketed text and compare
  std::ostringstream rendered;
  stitch(rendered, tree, tokens.begin(), states);

  BOOST_CHECK_EQUAL(rendered.str(), parsed);
}
94+
95+
BOOST_AUTO_TEST_SUITE_END()

0 commit comments

Comments
 (0)