@@ -558,7 +558,6 @@ void Database::scheduleNext(int folder_id, size_t countForFolder)
558
558
if (!countForFolder) {
559
559
emit updateIndexing (folder_id, false );
560
560
emit updateInstalled (folder_id, true );
561
- m_embeddings->save ();
562
561
}
563
562
if (!m_docsToScan.isEmpty ())
564
563
QTimer::singleShot (0 , this , &Database::scanQueue);
@@ -570,7 +569,7 @@ void Database::handleDocumentError(const QString &errorMessage,
570
569
qWarning () << errorMessage << document_id << document_path << error.text ();
571
570
}
572
571
573
- size_t Database::chunkStream (QTextStream &stream, int document_id, const QString &file,
572
+ size_t Database::chunkStream (QTextStream &stream, int folder_id, int document_id, const QString &file,
574
573
const QString &title, const QString &author, const QString &subject, const QString &keywords, int page,
575
574
int maxChunks)
576
575
{
@@ -580,6 +579,8 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
580
579
QList<QString> words;
581
580
int chunks = 0 ;
582
581
582
+ QVector<EmbeddingChunk> chunkList;
583
+
583
584
while (!stream.atEnd ()) {
584
585
QString word;
585
586
stream >> word;
@@ -605,22 +606,62 @@ size_t Database::chunkStream(QTextStream &stream, int document_id, const QString
605
606
qWarning () << " ERROR: Could not insert chunk into db" << q.lastError ();
606
607
}
607
608
609
+ #if 1
610
+ EmbeddingChunk toEmbed;
611
+ toEmbed.folder_id = folder_id;
612
+ toEmbed.chunk_id = chunk_id;
613
+ toEmbed.chunk = chunk;
614
+ chunkList << toEmbed;
615
+ if (chunkList.count () == 100 ) {
616
+ m_embLLM->generateAsyncEmbeddings (chunkList);
617
+ emit updateTotalEmbeddingsToIndex (folder_id, 100 );
618
+ chunkList.clear ();
619
+ }
620
+ #else
608
621
const std::vector<float> result = m_embLLM->generateEmbeddings(chunk);
609
622
if (!m_embeddings->add(result, chunk_id))
610
623
qWarning() << "ERROR: Cannot add point to embeddings index";
624
+ #endif
611
625
612
626
++chunks;
613
627
614
628
words.clear ();
615
629
charCount = 0 ;
616
630
617
631
if (maxChunks > 0 && chunks == maxChunks)
618
- return stream. pos () ;
632
+ break ;
619
633
}
620
634
}
635
+
636
+ if (!chunkList.isEmpty ()) {
637
+ m_embLLM->generateAsyncEmbeddings (chunkList);
638
+ emit updateTotalEmbeddingsToIndex (folder_id, chunkList.count ());
639
+ chunkList.clear ();
640
+ }
641
+
621
642
return stream.pos ();
622
643
}
623
644
645
+ void Database::handleEmbeddingsGenerated (const QVector<EmbeddingResult> &embeddings)
646
+ {
647
+ if (embeddings.isEmpty ())
648
+ return ;
649
+
650
+ int folder_id = 0 ;
651
+ for (auto e : embeddings) {
652
+ folder_id = e.folder_id ;
653
+ if (!m_embeddings->add (e.embedding , e.chunk_id ))
654
+ qWarning () << " ERROR: Cannot add point to embeddings index" ;
655
+ }
656
+ emit updateCurrentEmbeddingsToIndex (folder_id, embeddings.count ());
657
+ m_embeddings->save ();
658
+ }
659
+
660
+ void Database::handleErrorGenerated (int folder_id, const QString &error)
661
+ {
662
+ emit updateError (folder_id, error);
663
+ }
664
+
624
665
void Database::removeEmbeddingsByDocumentId (int document_id)
625
666
{
626
667
QSqlQuery q;
@@ -792,14 +833,13 @@ void Database::scanQueue()
792
833
const QPdfSelection selection = doc.getAllText (pageIndex);
793
834
QString text = selection.text ();
794
835
QTextStream stream (&text);
795
- chunkStream (stream, document_id, info.doc .fileName (),
836
+ chunkStream (stream, info. folder , document_id, info.doc .fileName (),
796
837
doc.metaData (QPdfDocument::MetaDataField::Title).toString (),
797
838
doc.metaData (QPdfDocument::MetaDataField::Author).toString (),
798
839
doc.metaData (QPdfDocument::MetaDataField::Subject).toString (),
799
840
doc.metaData (QPdfDocument::MetaDataField::Keywords).toString (),
800
841
pageIndex + 1
801
842
);
802
- m_embeddings->save ();
803
843
emit subtractCurrentBytesToIndex (info.folder , bytesPerPage);
804
844
if (info.currentPage < doc.pageCount ()) {
805
845
info.currentPage += 1 ;
@@ -828,9 +868,8 @@ void Database::scanQueue()
828
868
#if defined(DEBUG)
829
869
qDebug () << " scanning byteIndex" << byteIndex << " of" << bytes << document_path;
830
870
#endif
831
- int pos = chunkStream (stream, document_id, info.doc .fileName (), QString () /* title*/ , QString () /* author*/ ,
832
- QString () /* subject*/ , QString () /* keywords*/ , -1 /* page*/ , 5 /* maxChunks*/ );
833
- m_embeddings->save ();
871
+ int pos = chunkStream (stream, info.folder , document_id, info.doc .fileName (), QString () /* title*/ , QString () /* author*/ ,
872
+ QString () /* subject*/ , QString () /* keywords*/ , -1 /* page*/ , 100 /* maxChunks*/ );
834
873
file.close ();
835
874
const size_t bytesChunked = pos - byteIndex;
836
875
emit subtractCurrentBytesToIndex (info.folder , bytesChunked);
@@ -892,6 +931,8 @@ void Database::scanDocuments(int folder_id, const QString &folder_path)
892
931
void Database::start ()
893
932
{
894
933
connect (m_watcher, &QFileSystemWatcher::directoryChanged, this , &Database::directoryChanged);
934
+ connect (m_embLLM, &EmbeddingLLM::embeddingsGenerated, this , &Database::handleEmbeddingsGenerated);
935
+ connect (m_embLLM, &EmbeddingLLM::errorGenerated, this , &Database::handleErrorGenerated);
895
936
connect (this , &Database::docsToScanChanged, this , &Database::scanQueue);
896
937
if (!QSqlDatabase::drivers ().contains (" QSQLITE" )) {
897
938
qWarning () << " ERROR: missing sqllite driver" ;
@@ -1081,6 +1122,10 @@ void Database::retrieveFromDB(const QList<QString> &collections, const QString &
1081
1122
QSqlQuery q;
1082
1123
if (m_embeddings->isLoaded ()) {
1083
1124
std::vector<float > result = m_embLLM->generateEmbeddings (text);
1125
+ if (result.empty ()) {
1126
+ qDebug () << " ERROR: generating embeddings returned a null result" ;
1127
+ return ;
1128
+ }
1084
1129
std::vector<qint64> embeddings = m_embeddings->search (result, retrievalSize);
1085
1130
if (!selectChunk (q, collections, embeddings, retrievalSize)) {
1086
1131
qDebug () << " ERROR: selecting chunks:" << q.lastError ().text ();
0 commit comments