Skip to content

Commit d14b95f

Browse files
committed
Add Nomic Embed model for atlas with localdocs.
1 parent eadc3b8 commit d14b95f

15 files changed

+502
-74
lines changed

gpt4all-chat/chatllm.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,7 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
156156
if (isModelLoaded() && this->modelInfo() == modelInfo)
157157
return true;
158158

159-
bool isChatGPT = modelInfo.isChatGPT;
159+
bool isChatGPT = modelInfo.isOnline; // right now only chatgpt is offered for online chat models...
160160
QString filePath = modelInfo.dirpath + modelInfo.filename();
161161
QFileInfo fileInfo(filePath);
162162

gpt4all-chat/database.cpp

Lines changed: 53 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -558,7 +558,6 @@ void Database::scheduleNext(int folder_id, size_t countForFolder)
558558
if (!countForFolder) {
559559
emit updateIndexing(folder_id, false);
560560
emit updateInstalled(folder_id, true);
561-
m_embeddings->save();
562561
}
563562
if (!m_docsToScan.isEmpty())
564563
QTimer::singleShot(0, this, &Database::scanQueue);
@@ -570,7 +569,7 @@ void Database::handleDocumentError(const QString &errorMessage,
570569
qWarning() << errorMessage << document_id << document_path << error.text();
571570
}
572571

573-
size_t Database::chunkStream(QTextStream &stream, int document_id, const QString &file,
572+
size_t Database::chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &file,
574573
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
575574
int maxChunks)
576575
{
@@ -580,6 +579,8 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
580579
QList<QString> words;
581580
int chunks = 0;
582581

582+
QVector<EmbeddingChunk> chunkList;
583+
583584
while (!stream.atEnd()) {
584585
QString word;
585586
stream >> word;
@@ -605,22 +606,62 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
605606
qWarning() << "ERROR: Could not insert chunk into db" << q.lastError();
606607
}
607608

609+
#if 1
610+
EmbeddingChunk toEmbed;
611+
toEmbed.folder_id = folder_id;
612+
toEmbed.chunk_id = chunk_id;
613+
toEmbed.chunk = chunk;
614+
chunkList << toEmbed;
615+
if (chunkList.count() == 100) {
616+
m_embLLM->generateAsyncEmbeddings(chunkList);
617+
emit updateTotalEmbeddingsToIndex(folder_id, 100);
618+
chunkList.clear();
619+
}
620+
#else
608621
const std::vector<float> result = m_embLLM->generateEmbeddings(chunk);
609622
if (!m_embeddings->add(result, chunk_id))
610623
qWarning() << "ERROR: Cannot add point to embeddings index";
624+
#endif
611625

612626
++chunks;
613627

614628
words.clear();
615629
charCount = 0;
616630

617631
if (maxChunks > 0 && chunks == maxChunks)
618-
return stream.pos();
632+
break;
619633
}
620634
}
635+
636+
if (!chunkList.isEmpty()) {
637+
m_embLLM->generateAsyncEmbeddings(chunkList);
638+
emit updateTotalEmbeddingsToIndex(folder_id, chunkList.count());
639+
chunkList.clear();
640+
}
641+
621642
return stream.pos();
622643
}
623644

645+
void Database::handleEmbeddingsGenerated(const QVector<EmbeddingResult> &embeddings)
646+
{
647+
if (embeddings.isEmpty())
648+
return;
649+
650+
int folder_id = 0;
651+
for (auto e : embeddings) {
652+
folder_id = e.folder_id;
653+
if (!m_embeddings->add(e.embedding, e.chunk_id))
654+
qWarning() << "ERROR: Cannot add point to embeddings index";
655+
}
656+
emit updateCurrentEmbeddingsToIndex(folder_id, embeddings.count());
657+
m_embeddings->save();
658+
}
659+
660+
void Database::handleErrorGenerated(int folder_id, const QString &error)
661+
{
662+
emit updateError(folder_id, error);
663+
}
664+
624665
void Database::removeEmbeddingsByDocumentId(int document_id)
625666
{
626667
QSqlQuery q;
@@ -792,14 +833,13 @@ void Database::scanQueue()
792833
const QPdfSelection selection = doc.getAllText(pageIndex);
793834
QString text = selection.text();
794835
QTextStream stream(&text);
795-
chunkStream(stream, document_id, info.doc.fileName(),
836+
chunkStream(stream, info.folder, document_id, info.doc.fileName(),
796837
doc.metaData(QPdfDocument::MetaDataField::Title).toString(),
797838
doc.metaData(QPdfDocument::MetaDataField::Author).toString(),
798839
doc.metaData(QPdfDocument::MetaDataField::Subject).toString(),
799840
doc.metaData(QPdfDocument::MetaDataField::Keywords).toString(),
800841
pageIndex + 1
801842
);
802-
m_embeddings->save();
803843
emit subtractCurrentBytesToIndex(info.folder, bytesPerPage);
804844
if (info.currentPage < doc.pageCount()) {
805845
info.currentPage += 1;
@@ -828,9 +868,8 @@ void Database::scanQueue()
828868
#if defined(DEBUG)
829869
qDebug() << "scanning byteIndex" << byteIndex << "of" << bytes << document_path;
830870
#endif
831-
int pos = chunkStream(stream, document_id, info.doc.fileName(), QString() /*title*/, QString() /*author*/,
832-
QString() /*subject*/, QString() /*keywords*/, -1 /*page*/, 5 /*maxChunks*/);
833-
m_embeddings->save();
871+
int pos = chunkStream(stream, info.folder, document_id, info.doc.fileName(), QString() /*title*/, QString() /*author*/,
872+
QString() /*subject*/, QString() /*keywords*/, -1 /*page*/, 100 /*maxChunks*/);
834873
file.close();
835874
const size_t bytesChunked = pos - byteIndex;
836875
emit subtractCurrentBytesToIndex(info.folder, bytesChunked);
@@ -892,6 +931,8 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
892931
void Database::start()
893932
{
894933
connect(m_watcher, &QFileSystemWatcher::directoryChanged, this, &Database::directoryChanged);
934+
connect(m_embLLM, &EmbeddingLLM::embeddingsGenerated, this, &Database::handleEmbeddingsGenerated);
935+
connect(m_embLLM, &EmbeddingLLM::errorGenerated, this, &Database::handleErrorGenerated);
895936
connect(this, &Database::docsToScanChanged, this, &Database::scanQueue);
896937
if (!QSqlDatabase::drivers().contains("QSQLITE")) {
897938
qWarning() << "ERROR: missing sqllite driver";
@@ -1081,6 +1122,10 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
10811122
QSqlQuery q;
10821123
if (m_embeddings->isLoaded()) {
10831124
std::vector<float> result = m_embLLM->generateEmbeddings(text);
1125+
if (result.empty()) {
1126+
qDebug() << "ERROR: generating embeddings returned a null result";
1127+
return;
1128+
}
10841129
std::vector<qint64> embeddings = m_embeddings->search(result, retrievalSize);
10851130
if (!selectChunk(q, collections, embeddings, retrievalSize)) {
10861131
qDebug() << "ERROR: selecting chunks:" << q.lastError().text();

gpt4all-chat/database.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,9 @@
88
#include <QThread>
99
#include <QFileSystemWatcher>
1010

11+
#include "embllm.h"
12+
1113
class Embeddings;
12-
class EmbeddingLLM;
1314
struct DocumentInfo
1415
{
1516
int folder;
@@ -39,10 +40,13 @@ struct CollectionItem {
3940
int folder_id = -1;
4041
bool installed = false;
4142
bool indexing = false;
43+
QString error;
4244
int currentDocsToIndex = 0;
4345
int totalDocsToIndex = 0;
4446
size_t currentBytesToIndex = 0;
4547
size_t totalBytesToIndex = 0;
48+
size_t currentEmbeddingsToIndex = 0;
49+
size_t totalEmbeddingsToIndex = 0;
4650
};
4751
Q_DECLARE_METATYPE(CollectionItem)
4852

@@ -66,11 +70,14 @@ public Q_SLOTS:
6670
void docsToScanChanged();
6771
void updateInstalled(int folder_id, bool b);
6872
void updateIndexing(int folder_id, bool b);
73+
void updateError(int folder_id, const QString &error);
6974
void updateCurrentDocsToIndex(int folder_id, size_t currentDocsToIndex);
7075
void updateTotalDocsToIndex(int folder_id, size_t totalDocsToIndex);
7176
void subtractCurrentBytesToIndex(int folder_id, size_t subtractedBytes);
7277
void updateCurrentBytesToIndex(int folder_id, size_t currentBytesToIndex);
7378
void updateTotalBytesToIndex(int folder_id, size_t totalBytesToIndex);
79+
void updateCurrentEmbeddingsToIndex(int folder_id, size_t currentBytesToIndex);
80+
void updateTotalEmbeddingsToIndex(int folder_id, size_t totalBytesToIndex);
7481
void addCollectionItem(const CollectionItem &item);
7582
void removeFolderById(int folder_id);
7683
void removeCollectionItem(const QString &collectionName);
@@ -82,10 +89,12 @@ private Q_SLOTS:
8289
bool addFolderToWatch(const QString &path);
8390
bool removeFolderFromWatch(const QString &path);
8491
void addCurrentFolders();
92+
void handleEmbeddingsGenerated(const QVector<EmbeddingResult> &embeddings);
93+
void handleErrorGenerated(int folder_id, const QString &error);
8594

8695
private:
8796
void removeFolderInternal(const QString &collection, int folder_id, const QString &path);
88-
size_t chunkStream(QTextStream &stream, int document_id, const QString &file,
97+
size_t chunkStream(QTextStream &stream, int folder_id, int document_id, const QString &file,
8998
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
9099
int maxChunks = -1);
91100
void removeEmbeddingsByDocumentId(int document_id);

gpt4all-chat/embeddings.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,9 @@ bool Embeddings::add(const std::vector<float> &embedding, qint64 label)
129129
}
130130
}
131131

132+
if (embedding.empty())
133+
return false;
134+
132135
try {
133136
m_hnsw->addPoint(embedding.data(), label, false);
134137
} catch (const std::exception &e) {

0 commit comments

Comments
 (0)