bug fix and new executable added for testing

yifanzhang · yifanzhang · commit b0164ef1c746 · 2012-01-26T14:23:30.000-08:00
fixed hash bug: the output contained '( #)' where it should be '(# #)'
added pfpc_token, which takes tokens instead of a sentence as input
added testing for this fix
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -25,13 +25,18 @@ ADD_EXECUTABLE(test
                src/test/lexicon.cpp
                src/test/pcfg_parser.cpp
                src/test/tokenizer.cpp
+               src/test/pfp.cpp
                src/test/main.cpp
                )
 
 ADD_EXECUTABLE(pfpc
                src/pfpc/main.cpp
                )
 
+ADD_EXECUTABLE(pfpc_token
+               src/pfpc/pfpc_token.cpp
+               )
+
 ADD_LIBRARY(pfp SHARED
             src/pfp/config
             src/pfp/tokenizer.yy
@@ -48,16 +53,19 @@ ADD_EXECUTABLE(pfpd
 IF(APPLE)
    TARGET_LINK_LIBRARIES(pfpd pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
    TARGET_LINK_LIBRARIES(pfpc pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
+   TARGET_LINK_LIBRARIES(pfpc_token pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio)
    TARGET_LINK_LIBRARIES(test pfp boost_thread-mt boost_unit_test_framework-mt boost_regex-mt icuio)
    TARGET_LINK_LIBRARIES(pfp boost_filesystem-mt boost_thread-mt boost_regex-mt boost_system-mt icuio icuuc)
 ELSE(APPLE)
    TARGET_LINK_LIBRARIES(pfpd pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
    TARGET_LINK_LIBRARIES(pfpc pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
+   TARGET_LINK_LIBRARIES(pfpc_token pfp boost_filesystem boost_thread boost_regex boost_system icuio icuuc)
    TARGET_LINK_LIBRARIES(test pfp boost_thread boost_unit_test_framework boost_regex icuio icuuc)
 ENDIF(APPLE)
 
 INSTALL(TARGETS pfpd DESTINATION bin)
 INSTALL(TARGETS pfpc DESTINATION bin)
+INSTALL(TARGETS pfpc_token DESTINATION bin)
 
 INSTALL(TARGETS pfp LIBRARY DESTINATION lib)
 INSTALL(DIRECTORY share/pfp DESTINATION share)
diff --git a/include/pfp/util.hpp b/include/pfp/util.hpp
@@ -147,9 +147,17 @@ InputIterator stitch(Out & out, const node & tree, InputIterator word_it, StateL
   // don't output boundary
   if (tree.state == consts::boundary_state)
     return word_it;
-  out << '(' << states[tree.state].basic_category() << ' ';
-  if ( tree.children.empty() )
+
+  out << '(';
+  if (states[tree.state].basic_category() == "" && tree.children.empty())
+      out << *word_it;
+  else
+      out << states[tree.state].basic_category();
+  out << ' ';
+
+  if ( tree.children.empty() ) {
     out << *word_it++;
+  }
   else
   {
     for (std::vector< boost::shared_ptr< node > >::const_iterator it = tree.children.begin(); it != tree.children.end(); ++it)
diff --git a/src/pfpc/pfpc_token.cpp b/src/pfpc/pfpc_token.cpp
@@ -0,0 +1,98 @@
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdexcept>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/algorithm/string.hpp>
+#include <boost/filesystem/operations.hpp>
+
+#include <pfp/config.h>
+#include <pfp/tokenizer.h>
+#include <pfp/state_list.hpp>
+#include <pfp/lexicon.hpp>
+#include <pfp/unary_grammar.hpp>
+#include <pfp/binary_grammar.hpp>
+#include <pfp/binary_grammar.hpp>
+#include <pfp/pcfg_parser.hpp>
+
+using namespace com::wavii::pfp;
+using namespace boost;
+namespace fs = boost::filesystem;
+
+template<class T> // helper: fills obj from the file at p; T needs a load() member callable with a std::ifstream
+void load(T & obj, boost::filesystem::path p)
+{
+  if (!fs::exists(p)) // report the offending path instead of handing load() an unopened stream
+    throw std::runtime_error("can't find " + p.string());
+  std::ifstream in(p.string().c_str()); // stream state is not checked after opening; only the exists() test above guards it
+  obj.load(in);
+}
+
+int main(int argc, char * argv[]) // reads one token per line from stdin, parses them as a single sentence, prints the tree to stdout
+{
+  std::clog << "pfpc: command line interface for pfp!" << std::endl; // NOTE(review): banner says "pfpc" although this source builds the pfpc_token executable
+  std::clog << "build: " << __DATE__ << " (" << __TIME__ << ") of pfp version " << consts::version << " (c) Wavii,Inc. 2010" << std::endl;
+  std::clog << "usage: " << argv[0] << " <max sentence length=45> <data dir=/usr/share/pfp/>" << std::endl;
+
+  size_t sentence_length = argc < 2 ? 45 : lexical_cast<size_t>(argv[1]); // a non-numeric argv[1] makes lexical_cast throw; nothing here catches it
+  std::string data_dir = argc < 3 ? "/usr/share/pfp/" : argv[2]; // make install copies files to /usr/share/pfp by default
+
+  tokenizer tokenizer; // loaded below but never invoked in this function: the input is already split into tokens
+  state_list states;
+  lexicon lexicon(states);
+  unary_grammar ug(states);
+  binary_grammar bg(states);
+  pcfg_parser pcfg(states, ug, bg);
+
+  std::clog << "loading lexicon and grammar" << std::endl;
+  load(tokenizer, fs::path(data_dir) / "americanizations");
+  load(states, fs::path(data_dir) / "states");
+  { // inner scope so the four lexicon streams are destroyed right after lexicon.load returns
+    fs::path ps[] = { fs::path(data_dir) / "words", fs::path(data_dir) / "sigs", fs::path(data_dir) / "word_state", fs::path(data_dir) / "sig_state" };
+    std::ifstream ins[4];
+    for (int i = 0; i != 4; ++i)
+    {
+      if (!fs::exists(ps[i])) // same missing-file check as load(), applied to each lexicon file
+        throw std::runtime_error("can't find " + ps[i].string());
+      ins[i].open(ps[i].string().c_str());
+    }
+    lexicon.load(ins[0], ins[1], ins[2], ins[3]);
+  }
+  load(ug, fs::path(data_dir) / "unary_rules");
+  load(bg, fs::path(data_dir) / "binary_rules");
+  workspace w(sentence_length, states.size());
+
+  std::vector< std::string > words;
+  std::clog << "ready!  enter lines to parse:" << std::endl; // each input line is taken as one token, not as a whole sentence
+  for (std::string word; std::getline(std::cin, word); ) { // ends at EOF or at the first line that is empty after trimming
+    boost::trim(word);
+    if (word == "")
+      break;
+    words.push_back(word);
+  }
+
+  std::vector< std::pair< state_t, float > > state_weight;
+  std::vector< std::vector< state_score_t > > sentence_f;
+  node result;
+  // no tokenizer.tokenize(sentence, words) call here: words already holds one token per input line
+
+  for (std::vector< std::string >::const_iterator it = words.begin(); it != words.end(); ++it) // one vector of state_score_t per word, filled from lexicon.score
+  {
+    state_weight.clear(); lexicon.score(*it, std::back_inserter(state_weight));
+    sentence_f.push_back(std::vector< state_score_t >(state_weight.size()));
+    // scale by score_resolution in case we are downcasting our weights
+    for (size_t i = 0; i != state_weight.size(); ++i)
+      sentence_f.back()[i] = state_score_t(state_weight[i].first, state_weight[i].second * consts::score_resolution);
+  }
+  // add the boundary symbol
+  sentence_f.push_back( std::vector< state_score_t >(1, state_score_t(consts::boundary_state, 0.0f)));
+  // and parse!
+  if (!pcfg.parse(sentence_f, w, result))
+    std::cout << "ERRROR!" << std::endl; // NOTE(review): message is misspelled, and execution still falls through to stitch() after a failed parse
+  // stitch together the results
+  std::ostringstream oss; // NOTE(review): <sstream> is not included directly by this file
+  std::vector< std::string >::iterator word_it = words.begin();
+  stitch(oss, result, word_it, states);
+  std::cout << oss.str() << std::endl;
+}
diff --git a/src/test/pfp.cpp b/src/test/pfp.cpp
@@ -0,0 +1,95 @@
+
+#include <boost/test/unit_test.hpp>
+#include <boost/test/test_tools.hpp>
+
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdexcept>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/filesystem/operations.hpp>
+
+#include <pfp/config.h>
+#include <pfp/tokenizer.h>
+#include <pfp/state_list.hpp>
+#include <pfp/lexicon.hpp>
+#include <pfp/unary_grammar.hpp>
+#include <pfp/binary_grammar.hpp>
+#include <pfp/binary_grammar.hpp>
+#include <pfp/pcfg_parser.hpp>
+
+using namespace com::wavii::pfp;
+using namespace boost;
+namespace fs = boost::filesystem;
+
+template<class T> // test helper: fills obj from the file at p; T needs a load() member callable with a std::ifstream
+void load(T & obj, boost::filesystem::path p)
+{
+  if (!fs::exists(p)) // a missing data file surfaces as an exception carrying the path
+    throw std::runtime_error("can't find " + p.string());
+  std::ifstream in(p.string().c_str()); // stream state is not checked after opening; only the exists() test above guards it
+  obj.load(in);
+}
+
+BOOST_AUTO_TEST_SUITE( pfp_parser_test )
+
+BOOST_AUTO_TEST_CASE( test_pfp_hash ) // end-to-end check: tokenize, score, parse and stitch a sentence containing '#' and '$' tokens
+{
+  tokenizer tokenizer;
+  state_list states;
+  lexicon lexicon(states);
+  unary_grammar ug(states);
+  binary_grammar bg(states);
+  pcfg_parser pcfg(states, ug, bg);
+
+  const std::string data_dir = "./share/pfp"; // relative path: the test only finds its data when run from a directory containing share/pfp
+  load(tokenizer, fs::path(data_dir) / "americanizations");
+  load(states, fs::path(data_dir) / "states");
+  { // inner scope so the four lexicon streams are destroyed right after lexicon.load returns
+    fs::path ps[] = { fs::path(data_dir) / "words", fs::path(data_dir) / "sigs", fs::path(data_dir) / "word_state", fs::path(data_dir) / "sig_state" };
+    std::ifstream ins[4];
+    for (int i = 0; i != 4; ++i)
+    {
+      if (!fs::exists(ps[i])) // same missing-file check as load(), applied to each lexicon file
+        throw std::runtime_error("can't find " + ps[i].string());
+      ins[i].open(ps[i].string().c_str());
+    }
+    lexicon.load(ins[0], ins[1], ins[2], ins[3]);
+  }
+  load(ug, fs::path(data_dir) / "unary_rules");
+  load(bg, fs::path(data_dir) / "binary_rules");
+  workspace w(45, states.size());
+
+  const std::string sentence= "Description This 2005 Nissan Altima available from Rama Auto Inc with Stock # 330051 is priced at $ 9500.00 ."; // input under test; note the standalone '#' and '$' tokens
+  const std::string parsed = "(ROOT (S (S (VP (VBG Description) (NP (DT This) (CD 2005) (NNP Nissan) (NNP Altima)) (ADJP (JJ available) (PP (IN from) (NP (NP (NNP Rama) (NNP Auto) (NNP Inc)) (PP (IN with) (NP (NNP Stock)))))))) (NP (# #) (CD 330051)) (VP (VBZ is) (VP-VBN-v (VBN priced) (PP (IN at) (NP ($ $) (CD 9500.00))))) (. .)) )"; // expected tree: '#' must appear as "(# #)"; the trailing " )" is part of the exact match
+
+   {
+    std::vector< std::string > words;
+    std::vector< std::pair< state_t, float > > state_weight;
+    std::vector< std::vector< state_score_t > > sentence_f;
+    node result;
+    tokenizer.tokenize(sentence, words);
+    for (std::vector< std::string >::const_iterator it = words.begin(); it != words.end(); ++it) // one vector of state_score_t per word, filled from lexicon.score
+    {
+      state_weight.clear(); lexicon.score(*it, std::back_inserter(state_weight));
+      sentence_f.push_back(std::vector< state_score_t >(state_weight.size()));
+      // scale by score_resolution in case we are downcasting our weights
+      for (size_t i = 0; i != state_weight.size(); ++i)
+        sentence_f.back()[i] = state_score_t(state_weight[i].first, state_weight[i].second * consts::score_resolution);
+    }
+    // add the boundary symbol
+    sentence_f.push_back( std::vector< state_score_t >(1, state_score_t(consts::boundary_state, 0.0f)));
+    // and parse!
+    if (!pcfg.parse(sentence_f, w, result))
+      BOOST_FAIL("Parsing shouldn't fail!");
+    // stitch together the results
+    std::ostringstream oss; // NOTE(review): <sstream> is not included directly by this file
+    std::vector< std::string >::iterator word_it = words.begin();
+    stitch(oss, result, word_it, states);
+
+    BOOST_CHECK_EQUAL(oss.str(), parsed); // whole-string comparison, so whitespace differences in the stitched output fail the test
+   }
+}
+
+BOOST_AUTO_TEST_SUITE_END()