diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..9dc8000 --- /dev/null +++ b/.gitignore @@ -0,0 +1,13 @@ +third_party/openfst +third_party/re2 +Makefile +autom4te.cache/ +bin/fstcompile +bin/fstminimize +bin/fstprint +bin/regex2dfa +config.log +config.status +contrib/ +*.o +*/*.o diff --git a/Makefile.in b/Makefile.in index aedbe4f..1546b16 100644 --- a/Makefile.in +++ b/Makefile.in @@ -1,35 +1,41 @@ THIRDPARTY_DIR = third_party FST_DIR = $(THIRDPARTY_DIR)/openfst -FST_BIN_DIR = $(FST_DIR)/src/bin +FST_INC_DIR = $(FST_DIR)/src/include +FST_LIB_DIR = $(FST_DIR)/src/lib/.libs +FSTSCRIPT_LIB_DIR = $(FST_DIR)/src/script/.libs RE2_DIR = $(THIRDPARTY_DIR)/re2 RE2_LIB_DIR = $(RE2_DIR)/obj RE2_INC_DIR = $(RE2_DIR) OPTIMIZATION_FLAGS = -O3 -CXXFLAGS_ = $(CXXFLAGS) $(OPTIMIZATION_FLAGS) -Wall -I. -Isrc -I$(THIRDPARTY_DIR) -I$(RE2_INC_DIR) -LDFLAGS_ = $(LDFLAGS) $(OPTIMIZATION_FLAGS) -L$(RE2_LIB_DIR) -pthread -lre2 +CXXFLAGS_ = $(CXXFLAGS) $(OPTIMIZATION_FLAGS) -std=c++0x -DUSE_CXX0X -Wall -I. -Isrc -I$(THIRDPARTY_DIR) -I$(RE2_INC_DIR) -I$(FST_INC_DIR) +LDFLAGS_ = $(LDFLAGS) $(OPTIMIZATION_FLAGS) -L$(RE2_LIB_DIR) -L$(FST_LIB_DIR) -L$(FSTSCRIPT_LIB_DIR) -pthread -lre2 -lfst -lfstscript -ldl OBJ_REGEX2DFA = src/regex2dfa.o TARGET_REGEX2DFA = bin/regex2dfa TARGET_LIBRE2 = $(RE2_LIB_DIR)/libre2.a -TARGET_FSTBIN = bin/fstcompile +TARGET_FSTLIB = $(FST_DIR)/src/lib/.libs/libfst.a TARGET_TEST = bin/test -$(TARGET_REGEX2DFA): $(TARGET_LIBRE2) $(TARGET_FSTBIN) $(OBJ_REGEX2DFA) +$(TARGET_REGEX2DFA): $(OBJ_REGEX2DFA) $(CXX) $(CXXFLAGS_) $(OBJ_REGEX2DFA) -o $@ $(LDFLAGS_) -$(TARGET_FSTBIN): - cd $(FST_DIR) && $(MAKE) - cp -fv $(FST_BIN_DIR)/fstcompile bin/ - cp -fv $(FST_BIN_DIR)/fstminimize bin/ - cp -fv $(FST_BIN_DIR)/fstprint bin/ +$(OBJ_REGEX2DFA): $(TARGET_LIBRE2) $(TARGET_FSTLIB) -$(TARGET_LIBRE2): +$(TARGET_FSTLIB): + cd $(FST_DIR) && ./configure --enable-bin --disable-shared --enable-static && $(MAKE) + +$(TARGET_LIBRE2): $(RE2_DIR)/util/logging.h.fixed cd $(RE2_DIR) && $(MAKE) obj/libre2.a +$(RE2_DIR)/util/logging.h.fixed: + sed 's/LogMessage/RE2LogMessage/g' $(RE2_DIR)/util/logging.h > $(RE2_DIR)/util/logging.h.tmp + mv $(RE2_DIR)/util/logging.h.tmp $(RE2_DIR)/util/logging.h + touch $(RE2_DIR)/util/logging.h.fixed + %.o: %.cc $(CXX) $(CXXFLAGS_) -c -o $@ $< diff --git a/README.md b/README.md index e5bb3ec..bf925f3 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ regex2dfa This is a command-line utility that converts a regular expression to a DFA. * **input**: A perl-compatible regular expression, as defined by re2 [1]. -* **output**: An AT&T FST [2], which accepts an equivelent language to the input regular expression. +* **output**: An AT&T DFA [2], which accepts an equivelent language to the input regular expression. ### References @@ -30,10 +30,15 @@ fstcompile fstminimize fstprint regex2dfa Example Usage ------------- -The language of strings of length at least one, over the alphabet ```{a, b}```. +``` +$ ./bin/regex2dfa -r "^(a|b)*$" +0 0 97 97 +0 0 98 98 +0 +``` ``` -PATH=bin:$PATH regex2dfa -r "^(a|b)+$" +$ ./bin/regex2dfa -r "^(a|b)+$" 0 1 97 97 0 1 98 98 1 1 97 97 diff --git a/bin/test b/bin/test index 791d808..4b4ebad 100755 --- a/bin/test +++ b/bin/test @@ -17,7 +17,8 @@ def doTest(bin_dir, regex_file): with open(dfa_file) as fh: expected_dfa = fh.read() - cmd = "PATH="+bin_dir+" regex2dfa -r \""+regex+"\"" + regex2dfa_binary = os.path.join(bin_dir, 'regex2dfa') + cmd = regex2dfa_binary + " -r \""+regex+"\"" actual_dfa = commands.getstatusoutput(cmd)[1] if dfasAreEqual(actual_dfa, expected_dfa): @@ -37,3 +38,6 @@ def main(): doTest(bin_dir, test_file) sys.exit(0) + +if __name__ == "__main__": + main() diff --git a/src/regex2dfa.cc b/src/regex2dfa.cc index 1d723ad..3019816 100644 --- a/src/regex2dfa.cc +++ b/src/regex2dfa.cc @@ -3,11 +3,19 @@ #include #include +#include +#include + +#include "fst/fstlib.h" +#include "fst/script/fstscript.h" #include "re2/re2.h" #include "re2/regexp.h" #include "re2/prog.h" +std::map< std::string, uint32_t > state_map; +uint32_t state_counter = 0; + bool AttFstFromRegex(const std::string & regex, std::string * dfa) { // specify compile flags for re2 re2::Regexp::ParseFlags re_flags; @@ -27,10 +35,18 @@ bool AttFstFromRegex(const std::string & regex, std::string * dfa) { try { RE2::Options opt; re2::Regexp* re = re2::Regexp::Parse( regex, re_flags, &status ); - re2::Prog* prog = re->CompileToProg( opt.max_mem() ); - (*dfa) = prog->PrintEntireDFA( re2::Prog::kFullMatch ); + if (re!=NULL) { + re2::Prog* prog = re->CompileToProg( opt.max_mem() ); + if (prog!=NULL) { + (*dfa) = prog->PrintEntireDFA( re2::Prog::kFullMatch ); + } + } } catch (int e) { - // do nothing, we return the empty string + return false; + } + + if ((*dfa)=="") { + return false; } // cleanup @@ -42,52 +58,119 @@ bool AttFstFromRegex(const std::string & regex, std::string * dfa) { return true; } -bool AttFstMinimize( std::string & str_dfa, std::string * minimized_dfa) { - const char* temp_dir = getenv("TMPDIR"); - if (temp_dir == 0) { - temp_dir = "/tmp"; +std::vector tokenize(const std::string & line, + const char & delim) { + std::vector retval; + + std::istringstream iss(line); + std::string fragment; + while(std::getline(iss, fragment, delim)) { + retval.push_back(fragment); } - std::string temp_dir_str = std::string(temp_dir); - std::string temp_file = "000000"; - std::string abspath_dfa = temp_dir_str+ "/" + temp_file + ".dfa"; - std::string abspath_fst = temp_dir_str+ "/" + temp_file + ".fst"; - std::string abspath_fst_min = temp_dir_str+ "/" + temp_file + ".min.fst"; - std::string abspath_dfa_min = temp_dir_str+ "/" + temp_file + ".min.dfa"; + return retval; +} - // write our input DFA to disk - std::ofstream dfa_stream; - dfa_stream.open (abspath_dfa.c_str()); - dfa_stream << str_dfa; - dfa_stream.close(); +bool StateExists(std::string state_label) { + return (state_map.find(state_label) != state_map.end()); +} - std::string cmd; +uint32_t AddState(std::string state_label) { + state_map.insert(std::pair(state_label, state_counter++)); + return state_counter; +} - // convert our ATT DFA string to an FST - cmd = "fstcompile " + abspath_dfa + " " + abspath_fst; - system(cmd.c_str()); +uint32_t StateLookup(std::string state_label) { + return state_map.at(state_label); +} - // convert our FST to a minmized FST - cmd = "fstminimize " + abspath_fst + " " + abspath_fst_min; - system(cmd.c_str()); +bool CreateFst(const std::string & str_dfa, + fst::script::FstClass * input_fst) { - // covert our minimized FST to an ATT FST string - cmd = "fstprint " + abspath_fst_min + " " + abspath_dfa_min; - system(cmd.c_str()); + fst::StdVectorFst fst; - // read the contents of of the file at abspath_dfa_min to our retval - std::ifstream dfa_min_stream(abspath_dfa_min.c_str()); - std::stringstream buffer; - buffer << dfa_min_stream.rdbuf(); - dfa_min_stream.close(); + bool startStateIsntSet = true; + std::string line; + std::istringstream my_str_stream(str_dfa); + while ( getline (my_str_stream,line) ) { + if (line.empty()) { + break; + } - (*minimized_dfa) = std::string(buffer.str()); + std::vector split_vec = tokenize(line, ' '); + if (4 == split_vec.size()) { + if(!StateExists(split_vec.at(0))) { + fst.AddState(); + AddState(split_vec.at(0)); + } + if(!StateExists(split_vec.at(1))) { + fst.AddState(); + AddState(split_vec.at(1)); + } + fst.AddArc(StateLookup(split_vec.at(0)), + fst::StdArc(atoi(split_vec.at(2).c_str()), + atoi(split_vec.at(3).c_str()), + 0, + StateLookup(split_vec.at(1)))); + } else if (1 == split_vec.size()) { + if(!StateExists(split_vec.at(0))) { + fst.AddState(); + } + uint32_t final_state = StateLookup(split_vec.at(0)); + fst.SetFinal(final_state, 0); + } + } - // cleanup - remove( abspath_dfa.c_str() ); - remove( abspath_fst.c_str() ); - remove( abspath_fst_min.c_str() ); - remove( abspath_dfa_min.c_str() ); + fst.SetStart(0); + + *input_fst = static_cast(fst); + + return true; +} + +bool FormatFst(const std::string & str_dfa, + std::string * formatted_dfa) { + + std::string & retval = (*formatted_dfa); + + std::string line; + std::istringstream my_str_stream(str_dfa); + while ( getline (my_str_stream,line) ) { + if (line.empty()) { + break; + } + + std::vector split_vec = tokenize(line, '\t'); + if (4 == split_vec.size()) { + retval += split_vec.at(0); + retval += "\t" + split_vec.at(1); + retval += "\t" + split_vec.at(2); + retval += "\t" + split_vec.at(2); + retval += "\n"; + } else if (2 == split_vec.size()) { + retval += split_vec.at(0); + retval += "\n"; + } + } + + return true; +} + +bool AttFstMinimize(const std::string & str_dfa, + std::string * minimized_dfa) { + + fst::script::FstClass * fst = new fst::script::FstClass(); + + CreateFst(str_dfa, fst); + + fst::script::MutableFstClass * mutable_fst + = static_cast(fst); + fst::script::Minimize(mutable_fst); + + std::ostringstream ostrm; + fst::script::PrintFst(*fst, ostrm, "", NULL, NULL, NULL, true, true); + + FormatFst(ostrm.str(), minimized_dfa); return true; } @@ -116,8 +199,17 @@ int main (int argc, char **argv) { std::string dfa; std::string minimized_dfa; - AttFstFromRegex(input_regex, &dfa); - AttFstMinimize(dfa, &minimized_dfa); + bool compile_success = AttFstFromRegex(input_regex, &dfa); + + if (compile_success) { + bool minimize_success = AttFstMinimize(dfa, &minimized_dfa); + } else { + std::cerr << "\033[1;31mERROR\033[0m"; + std::cerr << ": Failed to compile regex: \"" + input_regex + "\""; + std::cerr << std::endl; + return 1; + } std::cout << minimized_dfa << std::endl; + return 0; } diff --git a/third_party/re2/util/logging.h b/third_party/re2/util/logging.h index 4443f7c..f534945 100644 --- a/third_party/re2/util/logging.h +++ b/third_party/re2/util/logging.h @@ -20,7 +20,7 @@ #define DCHECK_GT(val1, val2) assert((val1) > (val2)) // Always-on checking -#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x +#define CHECK(x) if(x){}else RE2RE2LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x #define CHECK_LT(x, y) CHECK((x) < (y)) #define CHECK_GT(x, y) CHECK((x) > (y)) #define CHECK_LE(x, y) CHECK((x) <= (y)) @@ -28,10 +28,10 @@ #define CHECK_EQ(x, y) CHECK((x) == (y)) #define CHECK_NE(x, y) CHECK((x) != (y)) -#define LOG_INFO LogMessage(__FILE__, __LINE__) +#define LOG_INFO RE2RE2LogMessage(__FILE__, __LINE__) #define LOG_ERROR LOG_INFO #define LOG_WARNING LOG_INFO -#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__) +#define LOG_FATAL RE2RE2LogMessageFatal(__FILE__, __LINE__) #define LOG_QFATAL LOG_FATAL #define VLOG(x) if((x)>0){}else LOG_INFO.stream() @@ -46,9 +46,9 @@ #define LOG(severity) LOG_ ## severity.stream() -class LogMessage { +class RE2RE2LogMessage { public: - LogMessage(const char* file, int line) : flushed_(false) { + RE2RE2LogMessage(const char* file, int line) : flushed_(false) { stream() << file << ":" << line << ": "; } void Flush() { @@ -58,7 +58,7 @@ class LogMessage { if(write(2, s.data(), n) < 0) {} // shut up gcc flushed_ = true; } - ~LogMessage() { + ~RE2RE2LogMessage() { if (!flushed_) { Flush(); } @@ -68,19 +68,19 @@ class LogMessage { private: bool flushed_; std::ostringstream str_; - DISALLOW_EVIL_CONSTRUCTORS(LogMessage); + DISALLOW_EVIL_CONSTRUCTORS(RE2RE2LogMessage); }; -class LogMessageFatal : public LogMessage { +class RE2RE2LogMessageFatal : public RE2RE2LogMessage { public: - LogMessageFatal(const char* file, int line) - : LogMessage(file, line) { } - ~LogMessageFatal() { + RE2RE2LogMessageFatal(const char* file, int line) + : RE2RE2LogMessage(file, line) { } + ~RE2RE2LogMessageFatal() { Flush(); abort(); } private: - DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal); + DISALLOW_EVIL_CONSTRUCTORS(RE2RE2LogMessageFatal); }; #endif // RE2_UTIL_LOGGING_H__