mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-23 01:18:22 +08:00
## Problem
The Go server build pipeline (`build.sh` + CMake + CGO bindings) was
tested on Ubuntu only. On macOS arm64 with Homebrew it fails in five
orthogonal places. None of these require platform-specific code paths —
the same source builds on both Linux and Darwin after these fixes.
## Reproduction (before)
```
$ uname -a
Darwin … 25.4.0 arm64
$ brew install cmake pcre2 simde
$ bash build.sh
…
error: 'simde/x86/sse4.1.h' file not found
error: implicit instantiation of undefined template 'std::basic_istringstream<char>'
error: no matching function for call to 'Join'
…
clang: error: no such file or directory: '/usr/local/lib/libpcre2-8.a'
```
## Fix (5 small, orthogonal changes)
### 1. `internal/cpp/CMakeLists.txt` — find Homebrew + libpcre2-8 portably
- Detect Apple platforms via `if(APPLE)`, call `brew --prefix` once, add
`${HOMEBREW_PREFIX}/include` and `${HOMEBREW_PREFIX}/lib`. No effect on
Linux.
- Replace the literal `libpcre2-8.a` link token (which only the Linux
linker finds in `/usr/local/lib` by default) with
`find_library(PCRE2_LIB NAMES pcre2-8 REQUIRED)`. Works on
`/usr/lib/x86_64-linux-gnu` (Debian/Ubuntu), `/usr/local/lib` (Intel Mac
& legacy Linux), `/opt/homebrew/lib` (Apple Silicon).
### 2. `internal/cpp/wordnet_lemmatizer.cpp` + `internal/cpp/rag_analyzer.cpp` — explicit `#include <sstream>`
libstdc++ (Linux) pulls `<sstream>` in transitively via `<fstream>`;
libc++ (Apple Clang) doesn't, so the existing `std::istringstream` /
`std::ostringstream` uses fail to compile on macOS. One-line include in
each file.
### 3. `internal/cpp/rag_analyzer.cpp` — `Join` template overload fix
`Join(tokens, start, tokens.size(), delim)` at line 146 passes `size_t`
to an `int` parameter. C++23 strict mode in Apple Clang refuses the
implicit narrowing and reports the 4-arg overload as a substitution
failure, leaving the call ambiguous between the 3-arg and 4-arg
templates. Fix: explicit `static_cast<int>(tokens.size())`. Behaviour
identical on libstdc++ — the narrowing was always intentional.
### 4. `internal/binding/rag_analyzer.go` — split darwin CGO LDFLAGS
The existing `#cgo darwin LDFLAGS: ... /usr/local/lib/libpcre2-8.a` only
matches Intel Macs. Apple Silicon Homebrew installs to `/opt/homebrew`.
Split into `darwin,arm64` and `darwin,amd64` build constraints with the
right absolute path on each.
### 5. `build.sh` — accept Homebrew path in the pcre2 sanity check
The sanity check looked at two Linux paths only and then fell through to
`sudo apt -y install libpcre2-dev` on failure. Added
`/opt/homebrew/lib/libpcre2-8.a`, and on Darwin failure now exits
cleanly with the right `brew install pcre2` hint instead of trying
`apt`.
## Verified
- `bash build.sh` now completes on macOS arm64 (Apple Silicon, brew 4.x,
cmake 4.x, Apple Clang 17, Go 1.25, pcre2 10.x, simde 0.8.x).
- Produced binaries: `bin/server_main`, `bin/admin_server`,
`bin/ragflow_cli`.
- `bin/server_main` boots, connects MySQL, runs migrations, loads the 64
model provider configs cleanly.
- Still builds on Linux — the CMake additions are inside an `if(APPLE)`
guard, the `find_library` call matches Linux paths too, the build.sh
check still tries `apt` when not on Darwin.
## Out of scope
The Go server itself currently fails at runtime unless it is pointed at
Elasticsearch (`Failed to initialize doc engine: failed to ping
Elasticsearch`). That failure comes from the placeholder Infinity engine
documented in `internal/engine/README.md` and is unrelated to this build
patchset.
---
Happy to split this into smaller PRs if you'd prefer (one per file). The
five changes are independent.
232 lines
8.2 KiB
C++
232 lines
8.2 KiB
C++
// Copyright(C) 2024 InfiniFlow, Inc. All rights reserved.
|
|
//
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// https://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
#include "wordnet_lemmatizer.h"
|
|
#include <fstream>
|
|
#include <filesystem>
|
|
#include <sstream> // std::istringstream — implicit via <fstream> on libstdc++ (Linux), explicit on libc++ (macOS)
|
|
|
|
namespace fs = std::filesystem;
|
|
|
|
// Single-letter part-of-speech tags used as keys into the lemma, exception and
// substitution tables in this file. The anonymous namespace keeps them local
// to this translation unit.
namespace {
const std::string ADJ = "a";     // adjective
const std::string ADJ_SAT = "s"; // adjective satellite; this file gives it ADJ's lemmas and exceptions
const std::string ADV = "r";     // adverb
const std::string NOUN = "n";    // noun
const std::string VERB = "v";    // verb
} // namespace
|
|
|
|
// Stores the WordNet directory path and immediately loads the lemma and
// exception tables from it. The status code returned by Load() is not
// inspected here.
WordNetLemmatizer::WordNetLemmatizer(const std::string &wordnet_path)
    : wordnet_path_(wordnet_path) {
    Load();
}
|
|
|
|
// Defaulted out of line; this file performs no custom teardown.
WordNetLemmatizer::~WordNetLemmatizer() = default;
|
|
|
|
int32_t WordNetLemmatizer::Load() {
|
|
file_map_ = {{ADJ, "adj"}, {ADV, "adv"}, {NOUN, "noun"}, {VERB, "verb"}};
|
|
|
|
MORPHOLOGICAL_SUBSTITUTIONS = {
|
|
{NOUN, {{"s", ""}, {"ses", "s"}, {"ves", "f"}, {"xes", "x"}, {"zes", "z"}, {"ches", "ch"}, {"shes", "sh"}, {"men", "man"}, {"ies", "y"}}},
|
|
{VERB, {{"s", ""}, {"ies", "y"}, {"es", "e"}, {"es", ""}, {"ed", "e"}, {"ed", ""}, {"ing", "e"}, {"ing", ""}}},
|
|
{ADJ, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}},
|
|
{ADV, {}},
|
|
{ADJ_SAT, {{"er", ""}, {"est", ""}, {"er", "e"}, {"est", "e"}}}};
|
|
|
|
POS_LIST = {NOUN, VERB, ADJ, ADV};
|
|
|
|
auto ret = LoadLemmas();
|
|
if (ret != 0) {
|
|
return ret;
|
|
}
|
|
|
|
LoadExceptions();
|
|
// return Status::OK();
|
|
return 0;
|
|
}
|
|
|
|
int32_t WordNetLemmatizer::LoadLemmas() {
|
|
fs::path root(wordnet_path_);
|
|
for (const auto &pair : file_map_) {
|
|
const std::string &pos_abbrev = pair.first;
|
|
const std::string &pos_name = pair.second;
|
|
fs::path index_path(root / ("index." + pos_name));
|
|
|
|
std::ifstream file(index_path.string());
|
|
if (!file.is_open()) {
|
|
return -1;
|
|
// return Status::InvalidAnalyzerFile(fmt::format("Failed to load WordNet lemmatizer, index.{}", pos_name));
|
|
}
|
|
|
|
std::string line;
|
|
|
|
while (std::getline(file, line)) {
|
|
if (line.empty() || line[0] == ' ') {
|
|
continue;
|
|
}
|
|
|
|
std::istringstream stream(line);
|
|
try {
|
|
std::string lemma;
|
|
stream >> lemma;
|
|
|
|
if (lemmas_.find(lemma) == lemmas_.end()) {
|
|
lemmas_[lemma] = std::unordered_set<std::string>();
|
|
}
|
|
lemmas_[lemma].insert(pos_abbrev);
|
|
|
|
if (pos_abbrev == ADJ) {
|
|
if (lemmas_.find(lemma) == lemmas_.end()) {
|
|
lemmas_[lemma] = std::unordered_set<std::string>();
|
|
}
|
|
lemmas_[lemma].insert(ADJ_SAT);
|
|
}
|
|
|
|
} catch (const std::exception &e) {
|
|
return -1;
|
|
// return Status::InvalidAnalyzerFile("Failed to load WordNet lemmatizer lemmas");
|
|
}
|
|
}
|
|
}
|
|
// return Status::OK();
|
|
return 0;
|
|
}
|
|
|
|
void WordNetLemmatizer::LoadExceptions() {
|
|
fs::path root(wordnet_path_);
|
|
for (const auto &pair : file_map_) {
|
|
const std::string &pos_abbrev = pair.first;
|
|
const std::string &pos_name = pair.second;
|
|
fs::path exc_path(root / (pos_name + ".exc"));
|
|
|
|
std::ifstream file(exc_path.string());
|
|
if (!file.is_open()) {
|
|
continue;
|
|
}
|
|
|
|
exceptions_[pos_abbrev] = {};
|
|
|
|
std::string line;
|
|
while (std::getline(file, line)) {
|
|
std::istringstream stream(line);
|
|
std::string inflected_form;
|
|
stream >> inflected_form;
|
|
|
|
std::vector<std::string> base_forms;
|
|
std::string base_form;
|
|
while (stream >> base_form) {
|
|
base_forms.push_back(base_form);
|
|
}
|
|
|
|
exceptions_[pos_abbrev][inflected_form] = base_forms;
|
|
}
|
|
}
|
|
exceptions_[ADJ_SAT] = exceptions_[ADJ];
|
|
}
|
|
|
|
std::vector<std::string> WordNetLemmatizer::CollectSubstitutions(const std::vector<std::string> &forms, const std::string &pos) {
|
|
const auto &substitutions = MORPHOLOGICAL_SUBSTITUTIONS.at(pos);
|
|
std::vector<std::string> results;
|
|
|
|
for (const auto &form : forms) {
|
|
for (const auto &[old_suffix, new_suffix] : substitutions) {
|
|
if (form.size() >= old_suffix.size() && form.compare(form.size() - old_suffix.size(), old_suffix.size(), old_suffix) == 0) {
|
|
results.push_back(form.substr(0, form.size() - old_suffix.size()) + new_suffix);
|
|
}
|
|
}
|
|
}
|
|
return results;
|
|
}
|
|
|
|
std::vector<std::string> WordNetLemmatizer::CollectSubstitutions(const std::string &form, const std::string &pos) {
|
|
const auto &substitutions = MORPHOLOGICAL_SUBSTITUTIONS.at(pos);
|
|
std::vector<std::string> results;
|
|
|
|
for (const auto &[old_suffix, new_suffix] : substitutions) {
|
|
if (form.size() >= old_suffix.size() && form.compare(form.size() - old_suffix.size(), old_suffix.size(), old_suffix) == 0) {
|
|
results.push_back(form.substr(0, form.size() - old_suffix.size()) + new_suffix);
|
|
}
|
|
}
|
|
return results;
|
|
}
|
|
|
|
std::vector<std::string> WordNetLemmatizer::FilterForms(const std::vector<std::string> &forms, const std::string &pos) {
|
|
std::vector<std::string> result;
|
|
std::unordered_set<std::string> seen;
|
|
|
|
for (const auto &form : forms) {
|
|
if (lemmas_.find(form) != lemmas_.end()) {
|
|
if (lemmas_[form].find(pos) != lemmas_[form].end()) {
|
|
if (seen.find(form) == seen.end()) {
|
|
result.push_back(form);
|
|
seen.insert(form);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
return result;
|
|
}
|
|
|
|
std::vector<std::string> WordNetLemmatizer::Morphy(const std::string &form, const std::string &pos, bool check_exceptions) {
|
|
const auto &pos_exceptions = exceptions_.at(pos);
|
|
|
|
// Check exceptions first
|
|
if (check_exceptions && pos_exceptions.find(form) != pos_exceptions.end()) {
|
|
std::vector<std::string> forms = pos_exceptions.at(form);
|
|
forms.push_back(form);
|
|
return FilterForms(forms, pos);
|
|
}
|
|
|
|
// Apply morphological rules (only ONE level, not recursive like Java)
|
|
// This matches Python NLTK WordNet behavior
|
|
std::vector<std::string> forms = CollectSubstitutions(form, pos);
|
|
std::vector<std::string> combined_forms = forms;
|
|
combined_forms.push_back(form);
|
|
|
|
auto results = FilterForms(combined_forms, pos);
|
|
return results;
|
|
}
|
|
|
|
std::string WordNetLemmatizer::Lemmatize(const std::string &form, const std::string &pos) {
|
|
std::vector<std::string> parts_of_speech;
|
|
if (!pos.empty()) {
|
|
parts_of_speech.push_back(pos);
|
|
} else {
|
|
// Use only NOUN to match Python NLTK default behavior
|
|
parts_of_speech = {NOUN};
|
|
}
|
|
|
|
for (const auto &part : parts_of_speech) {
|
|
auto analyses = Morphy(form, part);
|
|
if (!analyses.empty()) {
|
|
// Python NLTK returns the SHORTEST lemma: min(lemmas, key=len)
|
|
// For "as" -> ["as", "a"] -> returns "a"
|
|
// For "data" -> ["data", "datum"] -> returns "data"
|
|
// For "men" -> ["men", "man"] -> returns "men" (original form preferred when same length)
|
|
std::string shortest = analyses[0];
|
|
for (const auto &analysis : analyses) {
|
|
if (analysis.length() < shortest.length()) {
|
|
shortest = analysis;
|
|
}
|
|
}
|
|
// If original form is in the results and has same length as shortest, prefer original form
|
|
if (shortest != form) {
|
|
for (const auto &analysis : analyses) {
|
|
if (analysis == form && analysis.length() == shortest.length()) {
|
|
shortest = analysis;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
return shortest;
|
|
}
|
|
}
|
|
|
|
return form;
|
|
} |