diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index bae30ab8bb..7db0c612d7 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -146,6 +146,9 @@ static void ExtractFontName(const char* filename, std::string* fontname) { */ static void addAvailableLanguages(const std::string &datadir, std::vector<std::string> *langs) { + if (!std::filesystem::is_directory(datadir)) + return; + for (const auto& entry : std::filesystem::recursive_directory_iterator(datadir, std::filesystem::directory_options::follow_directory_symlink | @@ -347,7 +350,7 @@ int TessBaseAPI::Init(const char *data, int data_size, const char *language, Ocr // Update datapath and language requested for the last valid initialization. datapath_ = std::move(datapath); if (datapath_.empty() && !tesseract_->datadir.empty()) { - datapath_ = tesseract_->datadir; + datapath_ = tesseract_->datadir.string(); } language_ = language; @@ -396,7 +399,7 @@ void TessBaseAPI::GetLoadedLanguagesAsVector(std::vector<std::string> *langs) co void TessBaseAPI::GetAvailableLanguagesAsVector(std::vector<std::string> *langs) const { langs->clear(); if (tesseract_ != nullptr) { - addAvailableLanguages(tesseract_->datadir, langs); + addAvailableLanguages(tesseract_->datadir.string(), langs); std::sort(langs->begin(), langs->end()); } } @@ -858,7 +861,7 @@ const char *TessBaseAPI::GetInputName() { } const char *TessBaseAPI::GetDatapath() { - return tesseract_->datadir.c_str(); + return datapath_.c_str(); } int TessBaseAPI::GetSourceYResolution() { diff --git a/src/ccmain/paramsd.cpp b/src/ccmain/paramsd.cpp index 14f220f8e6..44093c3a9a 100644 --- a/src/ccmain/paramsd.cpp +++ b/src/ccmain/paramsd.cpp @@ -298,7 +298,7 @@ ParamsEditor::ParamsEditor(tesseract::Tesseract *tess, ScrollView *sv) { SVMenuNode *svMenuRoot = BuildListOfAllLeaves(tess); std::string paramfile; - paramfile = tess->datadir; + paramfile = tess->datadir.string(); paramfile += VARDIR; // parameters dir paramfile += "edited"; // actual name diff --git a/src/ccmain/tessedit.cpp 
b/src/ccmain/tessedit.cpp index c751888359..73c4e83514 100644 --- a/src/ccmain/tessedit.cpp +++ b/src/ccmain/tessedit.cpp @@ -29,6 +29,7 @@ #include "params.h" #include "stopper.h" #include "tesseractclass.h" +#include "tesserrstream.h" // for tesserr #include "tessvars.h" #include "tprintf.h" #ifndef DISABLED_LEGACY_ENGINE @@ -43,24 +44,25 @@ namespace tesseract { // Read a "config" file containing a set of variable, value pairs. // Searches the standard places: tessdata/configs, tessdata/tessconfigs // and also accepts a relative or absolute path name. -void Tesseract::read_config_file(const char *filename, SetParamConstraint constraint) { - std::string path = datadir; - path += "configs/"; - path += filename; - FILE *fp; - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = datadir; - path += "tessconfigs/"; - path += filename; - if ((fp = fopen(path.c_str(), "rb")) != nullptr) { - fclose(fp); - } else { - path = filename; - } - } - ParamUtils::ReadParamsFile(path.c_str(), constraint, this->params()); +void Tesseract::read_config_file(const char *filename, + SetParamConstraint constraint) { + // Construct potential config file paths + std::vector<std::filesystem::path> config_paths = { + datadir / "configs" / filename, + datadir / "tessconfigs" / filename, + std::filesystem::path(filename)}; + + // Use the first existing file or fallback to the last (filename) + auto config_file = std::find_if(config_paths.begin(), config_paths.end(), + [](const std::filesystem::path &path) { + std::error_code ec; + return std::filesystem::exists(path, ec); + }); + const std::filesystem::path &selected_path = + (config_file != config_paths.end()) ? 
*config_file : config_paths.back(); + + ParamUtils::ReadParamsFile(selected_path.string().c_str(), constraint, + this->params()); } // Returns false if a unicharset file for the specified language was not found @@ -81,17 +83,13 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, bool set_only_non_debug_params, TessdataManager *mgr) { // Set the language data path prefix lang = !language.empty() ? language : "eng"; - language_data_path_prefix = datadir; - language_data_path_prefix += lang; - language_data_path_prefix += "."; + std::filesystem::path tessdata_path = datadir / (lang + "." + kTrainedDataSuffix); // Initialize TessdataManager. - std::string tessdata_path = language_data_path_prefix + kTrainedDataSuffix; - if (!mgr->is_loaded() && !mgr->Init(tessdata_path.c_str())) { - tprintf("Error opening data file %s\n", tessdata_path.c_str()); - tprintf( + if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string().c_str())) { + tesserr << "Error opening data file " << tessdata_path.string() << '\n' << "Please make sure the TESSDATA_PREFIX environment variable is set" - " to your \"tessdata\" directory.\n"); + " to your \"tessdata\" directory.\n"; return false; } #ifdef DISABLED_LEGACY_ENGINE @@ -184,10 +182,8 @@ bool Tesseract::init_tesseract_lang_data(const std::string &arg0, } #ifndef DISABLED_LEGACY_ENGINE else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || !unicharset.load_from_file(&fp, false)) { - tprintf( - "Error: Tesseract (legacy) engine requested, but components are " - "not present in %s!!\n", - tessdata_path.c_str()); + tesserr << "Error: Tesseract (legacy) engine requested, but components are " + "not present in " << tessdata_path.string() << "!!\n"; return false; } #endif // ndef DISABLED_LEGACY_ENGINE diff --git a/src/ccutil/ccutil.cpp b/src/ccutil/ccutil.cpp index 930aa2636e..5e4f2a8166 100644 --- a/src/ccutil/ccutil.cpp +++ b/src/ccutil/ccutil.cpp @@ -11,11 +11,10 @@ // limitations under the License. 
#include "ccutil.h" +#include "tesserrstream.h" // for tesserr #include "tprintf.h" // for tprintf #include <cstdlib> -#include <cstring> // for std::strrchrA -#include <filesystem> // for std::filesystem namespace tesseract { @@ -33,68 +32,72 @@ CCUtil::CCUtil() CCUtil::~CCUtil() = default; /** - * @brief CCUtil::main_setup - set location of tessdata and name of image + * @brief Finds the path to the tessdata directory. * - * @param argv0 - paths to the directory with language files and config files. - * An actual value of argv0 is used if not nullptr, otherwise TESSDATA_PREFIX is - * used if not nullptr, next try to use compiled in -DTESSDATA_PREFIX. If - * previous is not successful - use current directory. - * @param basename - name of image + * This function determines the location of the tessdata directory based on the + * following order of precedence: + * 1. If `argv0` is provided, use it. + * 2. If `TESSDATA_PREFIX` environment variable is set and the path exists, use + * it. + * 3. On Windows, check for a "tessdata" directory in the executable's directory + * and use it. + * 4. If `TESSDATA_PREFIX` is defined at compile time, use its `tessdata` subdir. + * 5. Otherwise, use the current working directory. + * + * @param argv0 argument to be considered as the data directory path. + * @return The path to the tessdata directory or current directory. */ -void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { - imagebasename = basename; /**< name of image */ - - const char *tessdata_prefix = getenv("TESSDATA_PREFIX"); - - // Ignore TESSDATA_PREFIX if there is no matching filesystem entry. 
- if (tessdata_prefix != nullptr && !std::filesystem::exists(tessdata_prefix)) { - tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignore it\n", tessdata_prefix); - tessdata_prefix = nullptr; - } - +std::filesystem::path find_data_path(const std::string &argv0) { + // If argv0 is set, always use it even if it is not a valid directory if (!argv0.empty()) { - /* Use tessdata prefix from the command line. */ - datadir = argv0; - } else if (tessdata_prefix) { - /* Use tessdata prefix from the environment. */ - datadir = tessdata_prefix; -#if defined(_WIN32) - } else if (datadir.empty() || !std::filesystem::exists(datadir)) { - /* Look for tessdata in directory of executable. */ - char path[_MAX_PATH]; - DWORD length = GetModuleFileName(nullptr, path, sizeof(path)); - if (length > 0 && length < sizeof(path)) { - char *separator = std::strrchr(path, '\\'); - if (separator != nullptr) { - *separator = '\0'; - std::string subdir = path; - subdir += "/tessdata"; - if (std::filesystem::exists(subdir)) { - datadir = subdir; - } - } + std::filesystem::path path(argv0); + if (!std::filesystem::is_directory(path)) { + tesserr << "Warning (tessdata): '" << argv0 << "' is not a valid directory.\n"; } -#endif /* _WIN32 */ + return path; } - // datadir may still be empty: - if (datadir.empty()) { -#if defined(TESSDATA_PREFIX) - // Use tessdata prefix which was compiled in. - datadir = TESSDATA_PREFIX "/tessdata/"; - // Note that some software (for example conda) patches TESSDATA_PREFIX - // in the binary, so it might be shorter. Recalculate its length. 
- datadir.resize(std::strlen(datadir.c_str())); -#else - datadir = "./"; -#endif /* TESSDATA_PREFIX */ + // Check environment variable if argv0 is not specified + if (const char *tessdata_prefix = std::getenv("TESSDATA_PREFIX")) { + std::filesystem::path path(tessdata_prefix); + if (std::filesystem::exists(path)) { + return path; + } else { + tprintf("Warning: TESSDATA_PREFIX %s does not exist, ignoring.\n", + tessdata_prefix); + } } - // check for missing directory separator - const char lastchar = datadir.back(); - if (lastchar != '/' && lastchar != '\\') { - datadir += '/'; +#ifdef _WIN32 + // Windows-specific: look for a 'tessdata' directory next to the + // executable + wchar_t path[MAX_PATH]; + if (DWORD length = GetModuleFileNameW(nullptr, path, MAX_PATH); + length > 0 && length < MAX_PATH) { + std::filesystem::path exe_path(path); + auto tessdata_subdir = exe_path.parent_path() / "tessdata"; + if (std::filesystem::exists(tessdata_subdir)) { + return tessdata_subdir; + } } +#endif + + // Fallback to compile-time or current directory +#ifdef TESSDATA_PREFIX + return std::filesystem::path(TESSDATA_PREFIX) / "tessdata"; +#else + return std::filesystem::current_path(); +#endif } + +/** + * @brief CCUtil::main_setup - set location of tessdata and name of image + * + * @param argv0 - paths to the directory with language files and config files. 
+ */ +void CCUtil::main_setup(const std::string &argv0, const std::string &basename) { + imagebasename = basename; /**< name of image */ + datadir = find_data_path(argv0); +} } // namespace tesseract diff --git a/src/ccutil/ccutil.h b/src/ccutil/ccutil.h index e64199315f..2ffd18fec2 100644 --- a/src/ccutil/ccutil.h +++ b/src/ccutil/ccutil.h @@ -19,6 +19,8 @@ #ifndef TESSERACT_CCUTIL_CCUTIL_H_ #define TESSERACT_CCUTIL_CCUTIL_H_ +#include <filesystem> // for std::filesystem + #ifndef _WIN32 # include # include @@ -53,9 +55,8 @@ class TESS_API CCUtil { ParamsVectors *params() { return &params_; } - - std::string datadir; // dir for data files - std::string imagebasename; // name of image + std::filesystem::path datadir; // dir for data files + std::string imagebasename; // name of image std::string lang; std::string language_data_path_prefix; UNICHARSET unicharset;