In this part, we’ll learn how to load and process documents using LangChain. Document processing is crucial for building applications that can analyze and understand content from various sources.
Understanding Document Loaders
Document loaders are LangChain components that help you ingest content from various sources. We’ll focus on PDF processing since it’s commonly needed in business applications.
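No matter which loader you use, the result is an array of Document objects: each one carries a pageContent string and a metadata object describing where the text came from. A rough sketch of a single loaded document (the file path is hypothetical, and the exact metadata fields vary by loader):
// A LangChain Document, illustrated as a plain object.
// Metadata fields vary by loader; the path and page number here are just examples.
const exampleDoc = {
  pageContent: 'Text extracted from one page of the file...',
  metadata: {
    source: 'uploads/quarterly-report.pdf',
    loc: { pageNumber: 1 },
  },
};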
Setting Up Document Processing
Installing Additional Dependencies
npm install @langchain/community @langchain/textsplitters
npm install pdf-parse mammoth # For PDF and DOCX support
npm install multer # For handling file uploads later in this part
Creating a Document Loader Service
// src/services/documentLoaderService.js
import path from "node:path";
import { PDFLoader } from "@langchain/community/document_loaders/fs/pdf";
import { TextLoader } from "langchain/document_loaders/fs/text";
import { RecursiveCharacterTextSplitter } from "@langchain/textsplitters";

export class DocumentLoaderService {
  constructor() {
    this.textSplitter = new RecursiveCharacterTextSplitter({
      chunkSize: 1000,
      chunkOverlap: 200,
      separators: ["\n\n", "\n", " ", ""],
    });
  }

  async loadPDF(filePath) {
    try {
      const loader = new PDFLoader(filePath);
      const docs = await loader.load();
      console.log(`Loaded PDF with ${docs.length} pages`);
      return docs;
    } catch (error) {
      console.error('Error loading PDF:', error);
      throw new Error('Failed to load PDF document');
    }
  }

  async loadText(filePath) {
    try {
      const loader = new TextLoader(filePath);
      const docs = await loader.load();
      console.log(`Loaded text file with ${docs.length} document(s)`);
      return docs;
    } catch (error) {
      console.error('Error loading text file:', error);
      throw new Error('Failed to load text document');
    }
  }

  async splitDocuments(documents) {
    try {
      const splitDocs = await this.textSplitter.splitDocuments(documents);
      console.log(`Split into ${splitDocs.length} chunks`);
      return splitDocs;
    } catch (error) {
      console.error('Error splitting documents:', error);
      throw new Error('Failed to split documents');
    }
  }

  // Load an uploaded file based on its extension, then split it into chunks.
  // This is the method the upload route below relies on.
  async processUploadedFile(filePath, fileName) {
    const extension = path.extname(fileName).toLowerCase();
    const documents =
      extension === '.pdf'
        ? await this.loadPDF(filePath)
        : await this.loadText(filePath);
    return this.splitDocuments(documents);
  }
}
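Before wiring this into an API, it helps to see the service on its own. A minimal usage sketch, assuming an ES module context and a hypothetical PDF path:
// Example usage (run inside an ES module; the file path is hypothetical)
import { DocumentLoaderService } from './services/documentLoaderService.js';

const loaderService = new DocumentLoaderService();
const pages = await loaderService.loadPDF('./uploads/sample.pdf');
const chunks = await loaderService.splitDocuments(pages);
console.log(chunks[0].pageContent.slice(0, 200)); // preview the first chunk
Each chunk is still a Document, so the metadata attached by the loader is preserved through splitting.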
Advanced Text Splitting Strategies
// src/utils/customTextSplitter.js
export class SmartTextSplitter {
  constructor(options = {}) {
    this.chunkSize = options.chunkSize || 1000;    // target chunk size in characters
    this.chunkOverlap = options.chunkOverlap || 200; // max overlap carried between chunks, in characters
  }

  // Split text on sentence boundaries, packing sentences into chunks of
  // roughly `chunkSize` characters and carrying trailing words forward as overlap.
  async splitSemantically(text) {
    const sentences = text.split(/[.!?]+/);
    const chunks = [];
    let currentChunk = "";

    for (const sentence of sentences) {
      const trimmedSentence = sentence.trim();
      if (!trimmedSentence) continue;

      if ((currentChunk + trimmedSentence).length > this.chunkSize) {
        if (currentChunk) {
          chunks.push(currentChunk.trim());
          // Carry the last 20 words (capped at `chunkOverlap` characters) into the next chunk
          const overlapWords = currentChunk
            .split(' ')
            .slice(-20)
            .join(' ')
            .slice(-this.chunkOverlap);
          currentChunk = overlapWords + ' ' + trimmedSentence;
        } else {
          currentChunk = trimmedSentence;
        }
      } else {
        currentChunk += (currentChunk ? '. ' : '') + trimmedSentence;
      }
    }

    if (currentChunk) {
      chunks.push(currentChunk.trim());
    }

    return chunks;
  }
}
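A quick sketch of how the custom splitter behaves, using a small chunk size so the overlap is easy to see (the sample text is made up):
// Example usage of the custom splitter (sample text is illustrative)
import { SmartTextSplitter } from './utils/customTextSplitter.js';

const splitter = new SmartTextSplitter({ chunkSize: 120, chunkOverlap: 40 });
const chunks = await splitter.splitSemantically(
  'LangChain makes document processing approachable. Sentences are grouped into chunks. ' +
  'Trailing words are carried into the next chunk so context is not lost at the boundary.'
);
console.log(chunks); // two chunks, with a few overlapping words between them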
Building a File Upload Endpoint
// src/routes/documents.js
import express from 'express';
import multer from 'multer';
import fs from 'node:fs';
import { randomUUID } from 'node:crypto';
import { DocumentLoaderService } from '../services/documentLoaderService.js';

const router = express.Router();
const documentLoader = new DocumentLoaderService();

// Configure multer for file uploads (make sure the uploads/ directory exists)
fs.mkdirSync('uploads', { recursive: true });

const storage = multer.diskStorage({
  destination: (req, file, cb) => {
    cb(null, 'uploads/');
  },
  filename: (req, file, cb) => {
    const uniqueSuffix = Date.now() + '-' + Math.round(Math.random() * 1E9);
    cb(null, uniqueSuffix + '-' + file.originalname);
  },
});

const upload = multer({
  storage,
  fileFilter: (req, file, cb) => {
    const allowedTypes = ['application/pdf', 'text/plain'];
    if (allowedTypes.includes(file.mimetype)) {
      cb(null, true);
    } else {
      cb(new Error('Invalid file type. Only PDF and TXT files are allowed.'));
    }
  },
  limits: {
    fileSize: 10 * 1024 * 1024, // 10MB limit
  },
});

// Give each processed document a unique id
const generateDocumentId = () => randomUUID();

router.post('/upload', upload.single('document'), async (req, res) => {
  try {
    if (!req.file) {
      return res.status(400).json({ error: 'No file uploaded' });
    }

    const { path: filePath, originalname: fileName } = req.file;

    // Load the uploaded file and split it into chunks
    const documents = await documentLoader.processUploadedFile(filePath, fileName);

    const documentInfo = {
      id: generateDocumentId(),
      fileName,
      uploadedAt: new Date().toISOString(),
      chunkCount: documents.length,
    };

    res.json({
      success: true,
      document: documentInfo,
      message: `Successfully processed ${fileName} into ${documents.length} chunks`,
    });
  } catch (error) {
    console.error('Document upload error:', error);
    res.status(500).json({ error: error.message });
  }
});

export default router;
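To try the endpoint, mount the router on the Express app from the earlier parts. The file name src/app.js and the port are assumptions; adjust them to your setup:
// src/app.js (sketch; adapt to your existing server setup)
import express from 'express';
import documentsRouter from './routes/documents.js';

const app = express();
app.use('/api/documents', documentsRouter);

app.listen(3000, () => console.log('Server listening on port 3000'));
With the server running, you can send a multipart upload with curl -F "document=@./sample.pdf" http://localhost:3000/api/documents/upload (the sample file name is hypothetical).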
Best Practices for Document Processing
- Choose Appropriate Chunk Sizes: Balance context preservation against processing efficiency
- Maintain Metadata: Keep source information and processing details on every chunk (see the sketch after this list)
- Handle Different File Types: Implement format-specific loading logic, as processUploadedFile does for PDF and plain text
- Error Handling: Gracefully handle corrupted or unsupported files
- Performance: Consider asynchronous processing for large documents
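The metadata point can be made concrete by stamping source details onto each chunk before it goes to storage. A minimal sketch, assuming the chunks come from splitDocuments; the helper name and extra fields are illustrative, not required by LangChain:
// src/utils/enrichChunks.js (hypothetical helper; field names are illustrative)
export function enrichChunks(chunks, fileName) {
  return chunks.map((doc, index) => ({
    pageContent: doc.pageContent,
    metadata: {
      ...doc.metadata,         // keep loader metadata such as source path and page number
      originalName: fileName,  // the uploaded file's original name
      uploadedAt: new Date().toISOString(),
      chunkIndex: index,       // position of this chunk within the document
    },
  }));
}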
In Part 5, we’ll build vector stores and implement semantic search to find relevant information from our processed documents!