Skip to content

Commit

Permalink
Merge pull request #4 from kpdyer/kpdyer-issue1
Browse files Browse the repository at this point in the history
Make regex2dfa binary stand-alone
  • Loading branch information
kpdyer committed Jun 19, 2014
2 parents b4976d5 + 357b9a3 commit ea4633b
Show file tree
Hide file tree
Showing 6 changed files with 188 additions and 68 deletions.
13 changes: 13 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
third_party/openfst
third_party/re2
Makefile
autom4te.cache/
bin/fstcompile
bin/fstminimize
bin/fstprint
bin/regex2dfa
config.log
config.status
contrib/
*.o
*/*.o
28 changes: 17 additions & 11 deletions Makefile.in
Original file line number Diff line number Diff line change
@@ -1,35 +1,41 @@
THIRDPARTY_DIR = third_party

FST_DIR = $(THIRDPARTY_DIR)/openfst
FST_BIN_DIR = $(FST_DIR)/src/bin
FST_INC_DIR = $(FST_DIR)/src/include
FST_LIB_DIR = $(FST_DIR)/src/lib/.libs
FSTSCRIPT_LIB_DIR = $(FST_DIR)/src/script/.libs

RE2_DIR = $(THIRDPARTY_DIR)/re2
RE2_LIB_DIR = $(RE2_DIR)/obj
RE2_INC_DIR = $(RE2_DIR)

OPTIMIZATION_FLAGS = -O3
CXXFLAGS_ = $(CXXFLAGS) $(OPTIMIZATION_FLAGS) -Wall -I. -Isrc -I$(THIRDPARTY_DIR) -I$(RE2_INC_DIR)
LDFLAGS_ = $(LDFLAGS) $(OPTIMIZATION_FLAGS) -L$(RE2_LIB_DIR) -pthread -lre2
CXXFLAGS_ = $(CXXFLAGS) $(OPTIMIZATION_FLAGS) -std=c++0x -DUSE_CXX0X -Wall -I. -Isrc -I$(THIRDPARTY_DIR) -I$(RE2_INC_DIR) -I$(FST_INC_DIR)
LDFLAGS_ = $(LDFLAGS) $(OPTIMIZATION_FLAGS) -L$(RE2_LIB_DIR) -L$(FST_LIB_DIR) -L$(FSTSCRIPT_LIB_DIR) -pthread -lre2 -lfst -lfstscript -ldl

OBJ_REGEX2DFA = src/regex2dfa.o

TARGET_REGEX2DFA = bin/regex2dfa
TARGET_LIBRE2 = $(RE2_LIB_DIR)/libre2.a
TARGET_FSTBIN = bin/fstcompile
TARGET_FSTLIB = $(FST_DIR)/src/lib/.libs/libfst.a
TARGET_TEST = bin/test

$(TARGET_REGEX2DFA): $(TARGET_LIBRE2) $(TARGET_FSTBIN) $(OBJ_REGEX2DFA)
$(TARGET_REGEX2DFA): $(OBJ_REGEX2DFA)
$(CXX) $(CXXFLAGS_) $(OBJ_REGEX2DFA) -o $@ $(LDFLAGS_)

$(TARGET_FSTBIN):
cd $(FST_DIR) && $(MAKE)
cp -fv $(FST_BIN_DIR)/fstcompile bin/
cp -fv $(FST_BIN_DIR)/fstminimize bin/
cp -fv $(FST_BIN_DIR)/fstprint bin/
$(OBJ_REGEX2DFA): $(TARGET_LIBRE2) $(TARGET_FSTLIB)

$(TARGET_LIBRE2):
$(TARGET_FSTLIB):
cd $(FST_DIR) && ./configure --enable-bin --disable-shared --enable-static && $(MAKE)

$(TARGET_LIBRE2): $(RE2_DIR)/util/logging.h.fixed
cd $(RE2_DIR) && $(MAKE) obj/libre2.a

# Rewrites re2's util/logging.h so that every LogMessage identifier becomes
# RE2LogMessage, then drops a logging.h.fixed stamp file so the rewrite is not
# repeated on later builds.
# (presumably the rename avoids a clash with another LogMessage class pulled in
# by the OpenFst headers -- confirm.)
#
# The pattern also swallows any RE2 prefixes that are already present, so
# running this rule against an already-patched header is a no-op rather than
# producing RE2RE2LogMessage.
$(RE2_DIR)/util/logging.h.fixed:
	sed 's/\(RE2\)*LogMessage/RE2LogMessage/g' $(RE2_DIR)/util/logging.h > $(RE2_DIR)/util/logging.h.tmp
	mv $(RE2_DIR)/util/logging.h.tmp $(RE2_DIR)/util/logging.h
	touch $(RE2_DIR)/util/logging.h.fixed

%.o: %.cc
$(CXX) $(CXXFLAGS_) -c -o $@ $<

Expand Down
11 changes: 8 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ regex2dfa
This is a command-line utility that converts a regular expression to a DFA.

* **input**: A perl-compatible regular expression, as defined by re2 [1].
* **output**: An AT&T FST [2], which accepts an equivelent language to the input regular expression.
* **output**: An AT&T DFA [2], which accepts an equivalent language to the input regular expression.

### References

Expand All @@ -30,10 +30,15 @@ fstcompile fstminimize fstprint regex2dfa
Example Usage
-------------

The language of strings of length at least one, over the alphabet ```{a, b}```.
```
$ ./bin/regex2dfa -r "^(a|b)*$"
0 0 97 97
0 0 98 98
0
```

```
PATH=bin:$PATH regex2dfa -r "^(a|b)+$"
$ ./bin/regex2dfa -r "^(a|b)+$"
0 1 97 97
0 1 98 98
1 1 97 97
Expand Down
6 changes: 5 additions & 1 deletion bin/test
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ def doTest(bin_dir, regex_file):
with open(dfa_file) as fh:
expected_dfa = fh.read()

cmd = "PATH="+bin_dir+" regex2dfa -r \""+regex+"\""
regex2dfa_binary = os.path.join(bin_dir, 'regex2dfa')
cmd = regex2dfa_binary + " -r \""+regex+"\""
actual_dfa = commands.getstatusoutput(cmd)[1]

if dfasAreEqual(actual_dfa, expected_dfa):
Expand All @@ -37,3 +38,6 @@ def main():
doTest(bin_dir, test_file)

sys.exit(0)

if __name__ == "__main__":
main()
174 changes: 133 additions & 41 deletions src/regex2dfa.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,19 @@

#include <iostream>
#include <fstream>
#include <cstdlib>
#include <map>

#include "fst/fstlib.h"
#include "fst/script/fstscript.h"

#include "re2/re2.h"
#include "re2/regexp.h"
#include "re2/prog.h"

std::map< std::string, uint32_t > state_map;
uint32_t state_counter = 0;

bool AttFstFromRegex(const std::string & regex, std::string * dfa) {
// specify compile flags for re2
re2::Regexp::ParseFlags re_flags;
Expand All @@ -27,10 +35,18 @@ bool AttFstFromRegex(const std::string & regex, std::string * dfa) {
try {
RE2::Options opt;
re2::Regexp* re = re2::Regexp::Parse( regex, re_flags, &status );
re2::Prog* prog = re->CompileToProg( opt.max_mem() );
(*dfa) = prog->PrintEntireDFA( re2::Prog::kFullMatch );
if (re!=NULL) {
re2::Prog* prog = re->CompileToProg( opt.max_mem() );
if (prog!=NULL) {
(*dfa) = prog->PrintEntireDFA( re2::Prog::kFullMatch );
}
}
} catch (int e) {
// do nothing, we return the empty string
return false;
}

if ((*dfa)=="") {
return false;
}

// cleanup
Expand All @@ -42,52 +58,119 @@ bool AttFstFromRegex(const std::string & regex, std::string * dfa) {
return true;
}

bool AttFstMinimize( std::string & str_dfa, std::string * minimized_dfa) {
const char* temp_dir = getenv("TMPDIR");
if (temp_dir == 0) {
temp_dir = "/tmp";
std::vector<std::string> tokenize(const std::string & line,
const char & delim) {
std::vector<std::string> retval;

std::istringstream iss(line);
std::string fragment;
while(std::getline(iss, fragment, delim)) {
retval.push_back(fragment);
}
std::string temp_dir_str = std::string(temp_dir);
std::string temp_file = "000000";

std::string abspath_dfa = temp_dir_str+ "/" + temp_file + ".dfa";
std::string abspath_fst = temp_dir_str+ "/" + temp_file + ".fst";
std::string abspath_fst_min = temp_dir_str+ "/" + temp_file + ".min.fst";
std::string abspath_dfa_min = temp_dir_str+ "/" + temp_file + ".min.dfa";
return retval;
}

// write our input DFA to disk
std::ofstream dfa_stream;
dfa_stream.open (abspath_dfa.c_str());
dfa_stream << str_dfa;
dfa_stream.close();
// Reports whether `state_label` already has an entry in the file-level
// `state_map`.
bool StateExists(std::string state_label) {
  const bool already_known = state_map.count(state_label) > 0;
  return already_known;
}

std::string cmd;
// Registers `state_label` in the file-level `state_map`, assigning it the
// current value of `state_counter` as its id.
//
// std::map::insert leaves the map untouched when the key is already present,
// so the counter is advanced only when a new entry was really created.
// Bumping it unconditionally would skip an id, and the ids stored here are
// later used as FST state numbers by CreateFst.
//
// Returns the value of `state_counter` after the call, i.e. one past the id
// that was just assigned (NOT the id itself). Use StateLookup to obtain the
// id of a label.
uint32_t AddState(std::string state_label) {
  const bool inserted =
      state_map.insert(std::pair<std::string, uint32_t>(state_label,
                                                        state_counter)).second;
  if (inserted) {
    ++state_counter;
  }
  return state_counter;
}

// convert our ATT DFA string to an FST
cmd = "fstcompile " + abspath_dfa + " " + abspath_fst;
system(cmd.c_str());
uint32_t StateLookup(std::string state_label) {
return state_map.at(state_label);
}

// convert our FST to a minmized FST
cmd = "fstminimize " + abspath_fst + " " + abspath_fst_min;
system(cmd.c_str());
bool CreateFst(const std::string & str_dfa,
fst::script::FstClass * input_fst) {

// covert our minimized FST to an ATT FST string
cmd = "fstprint " + abspath_fst_min + " " + abspath_dfa_min;
system(cmd.c_str());
fst::StdVectorFst fst;

// read the contents of of the file at abspath_dfa_min to our retval
std::ifstream dfa_min_stream(abspath_dfa_min.c_str());
std::stringstream buffer;
buffer << dfa_min_stream.rdbuf();
dfa_min_stream.close();
bool startStateIsntSet = true;
std::string line;
std::istringstream my_str_stream(str_dfa);
while ( getline (my_str_stream,line) ) {
if (line.empty()) {
break;
}

(*minimized_dfa) = std::string(buffer.str());
std::vector<std::string> split_vec = tokenize(line, ' ');
if (4 == split_vec.size()) {
if(!StateExists(split_vec.at(0))) {
fst.AddState();
AddState(split_vec.at(0));
}
if(!StateExists(split_vec.at(1))) {
fst.AddState();
AddState(split_vec.at(1));
}
fst.AddArc(StateLookup(split_vec.at(0)),
fst::StdArc(atoi(split_vec.at(2).c_str()),
atoi(split_vec.at(3).c_str()),
0,
StateLookup(split_vec.at(1))));
} else if (1 == split_vec.size()) {
if(!StateExists(split_vec.at(0))) {
fst.AddState();
}
uint32_t final_state = StateLookup(split_vec.at(0));
fst.SetFinal(final_state, 0);
}
}

// cleanup
remove( abspath_dfa.c_str() );
remove( abspath_fst.c_str() );
remove( abspath_fst_min.c_str() );
remove( abspath_dfa_min.c_str() );
fst.SetStart(0);

*input_fst = static_cast<fst::script::FstClass>(fst);

return true;
}

// Re-formats a tab-separated, line-oriented automaton description and appends
// the result to `*formatted_dfa` (existing contents are kept).
//
//   * a line with four fields  a \t b \t c \t d  is emitted as  a \t b \t c \t c
//   * a line with two fields   a \t b            is emitted as  a
//   * lines with any other field count are dropped
//
// Processing stops at the first empty line. Always returns true.
//
// NOTE(review): the only caller in this file feeds in PrintFst output produced
// in acceptor mode with weights shown, so the fourth field is presumably a
// weight and repeating field 2 yields matching input/output labels -- confirm
// before "fixing" the duplicated column.
bool FormatFst(const std::string & str_dfa,
               std::string * formatted_dfa) {
  std::string & out = *formatted_dfa;

  std::istringstream rows(str_dfa);
  for (std::string row; std::getline(rows, row) && !row.empty(); ) {
    const std::vector<std::string> cols = tokenize(row, '\t');

    switch (cols.size()) {
      case 4:
        // Transition line: keep the first three fields, repeat the third.
        out += cols[0];
        out += "\t" + cols[1];
        out += "\t" + cols[2];
        out += "\t" + cols[2];
        out += "\n";
        break;
      case 2:
        // Two-field line: keep only the first field.
        out += cols[0];
        out += "\n";
        break;
      default:
        break;
    }
  }

  return true;
}

bool AttFstMinimize(const std::string & str_dfa,
std::string * minimized_dfa) {

fst::script::FstClass * fst = new fst::script::FstClass();

CreateFst(str_dfa, fst);

fst::script::MutableFstClass * mutable_fst
= static_cast<fst::script::MutableFstClass*>(fst);
fst::script::Minimize(mutable_fst);

std::ostringstream ostrm;
fst::script::PrintFst(*fst, ostrm, "", NULL, NULL, NULL, true, true);

FormatFst(ostrm.str(), minimized_dfa);

return true;
}
Expand Down Expand Up @@ -116,8 +199,17 @@ int main (int argc, char **argv) {
std::string dfa;
std::string minimized_dfa;

AttFstFromRegex(input_regex, &dfa);
AttFstMinimize(dfa, &minimized_dfa);
bool compile_success = AttFstFromRegex(input_regex, &dfa);

if (compile_success) {
bool minimize_success = AttFstMinimize(dfa, &minimized_dfa);
} else {
std::cerr << "\033[1;31mERROR\033[0m";
std::cerr << ": Failed to compile regex: \"" + input_regex + "\"";
std::cerr << std::endl;
return 1;
}

std::cout << minimized_dfa << std::endl;
return 0;
}
24 changes: 12 additions & 12 deletions third_party/re2/util/logging.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,18 @@
#define DCHECK_GT(val1, val2) assert((val1) > (val2))

// Always-on checking
#define CHECK(x) if(x){}else LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK(x) if(x){}else RE2RE2LogMessageFatal(__FILE__, __LINE__).stream() << "Check failed: " #x
#define CHECK_LT(x, y) CHECK((x) < (y))
#define CHECK_GT(x, y) CHECK((x) > (y))
#define CHECK_LE(x, y) CHECK((x) <= (y))
#define CHECK_GE(x, y) CHECK((x) >= (y))
#define CHECK_EQ(x, y) CHECK((x) == (y))
#define CHECK_NE(x, y) CHECK((x) != (y))

#define LOG_INFO LogMessage(__FILE__, __LINE__)
#define LOG_INFO RE2RE2LogMessage(__FILE__, __LINE__)
#define LOG_ERROR LOG_INFO
#define LOG_WARNING LOG_INFO
#define LOG_FATAL LogMessageFatal(__FILE__, __LINE__)
#define LOG_FATAL RE2RE2LogMessageFatal(__FILE__, __LINE__)
#define LOG_QFATAL LOG_FATAL

#define VLOG(x) if((x)>0){}else LOG_INFO.stream()
Expand All @@ -46,9 +46,9 @@

#define LOG(severity) LOG_ ## severity.stream()

class LogMessage {
class RE2RE2LogMessage {
public:
LogMessage(const char* file, int line) : flushed_(false) {
RE2RE2LogMessage(const char* file, int line) : flushed_(false) {
stream() << file << ":" << line << ": ";
}
void Flush() {
Expand All @@ -58,7 +58,7 @@ class LogMessage {
if(write(2, s.data(), n) < 0) {} // shut up gcc
flushed_ = true;
}
~LogMessage() {
~RE2RE2LogMessage() {
if (!flushed_) {
Flush();
}
Expand All @@ -68,19 +68,19 @@ class LogMessage {
private:
bool flushed_;
std::ostringstream str_;
DISALLOW_EVIL_CONSTRUCTORS(LogMessage);
DISALLOW_EVIL_CONSTRUCTORS(RE2RE2LogMessage);
};

class LogMessageFatal : public LogMessage {
class RE2RE2LogMessageFatal : public RE2RE2LogMessage {
public:
LogMessageFatal(const char* file, int line)
: LogMessage(file, line) { }
~LogMessageFatal() {
RE2RE2LogMessageFatal(const char* file, int line)
: RE2RE2LogMessage(file, line) { }
~RE2RE2LogMessageFatal() {
Flush();
abort();
}
private:
DISALLOW_EVIL_CONSTRUCTORS(LogMessageFatal);
DISALLOW_EVIL_CONSTRUCTORS(RE2RE2LogMessageFatal);
};

#endif // RE2_UTIL_LOGGING_H__

0 comments on commit ea4633b

Please sign in to comment.