![]() Server : Apache/2 System : Linux server-15-235-50-60 5.15.0-164-generic #174-Ubuntu SMP Fri Nov 14 20:25:16 UTC 2025 x86_64 User : gositeme ( 1004) PHP Version : 8.2.29 Disable Function : exec,system,passthru,shell_exec,proc_close,proc_open,dl,popen,show_source,posix_kill,posix_mkfifo,posix_getpwuid,posix_setpgid,posix_setsid,posix_setuid,posix_setgid,posix_seteuid,posix_setegid,posix_uname Directory : /home/gositeme/domains/lavocat.ca/public_html/src/lib/ |
import { prisma } from './prisma';
import * as puppeteer from 'puppeteer';
import { BarreauLawyerData } from './barreau-verification';
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
/**
 * Normalized shape of one lawyer record as produced by the scraper and
 * consumed by `BarreauScraper.processLawyerData`.
 */
interface ScrapedLawyerData {
// Bar membership number (scraped from the "Numéro de membre" field); used to
// look up existing users and to build a placeholder email on creation.
barNumber: string;
// Display name taken from the profile page.
name: string;
// Contact email; when missing, a placeholder address is generated on creation.
email?: string;
phone?: string;
address?: string;
// Practice areas (scraped from "Domaines de droit"); persisted as a JSON string.
specializations: string[];
// Bar sections (scraped from "Barreau de section"); persisted as a JSON string.
regions: string[];
// True when one of the accepted mandate types mentions "aide juridique".
acceptsLegalAid: boolean;
// Persisted as `barreauStatus`; the scraper in this file always passes 'ACTIVE'.
status: 'ACTIVE' | 'SUSPENDED' | 'INACTIVE';
// Persisted as `websiteUrl`.
website?: string;
// NOTE(review): not populated or persisted by the code visible in this file.
linkedinUrl?: string;
education?: string;
// Current year minus the year of first registration, when that year is known.
// NOTE(review): not written to the database by `processLawyerData`.
yearsOfExperience?: number;
// Spoken languages; joined with ', ' into the `language` column when saved.
languages?: string[];
bio?: string;
}
/**
 * Headless-browser scraper that walks the Barreau du Québec public directory
 * and upserts every lawyer it finds into the `user` table through Prisma.
 *
 * Typical usage: `initialize()` → `scrapeEntireDirectory()` → `stop()`.
 * `getProgress()` may be polled at any time.
 */
export class BarreauScraper {
  /** Number of results the directory is assumed to show per page. */
  private static readonly RESULTS_PER_PAGE = 20;

  // Kept as `any` so the block does not depend on a specific Puppeteer typings version.
  private browser: any = null;
  private page: any = null;
  private isRunning = false;
  private progress = {
    totalPages: 0,
    currentPage: 0,
    totalLawyers: 0,
    importedLawyers: 0,
    errors: 0,
    startTime: Date.now()
  };

  /**
   * Launch the headless browser and prepare a page for scraping.
   *
   * Any browser left over from a previous `initialize()` call is closed first
   * so repeated initialization does not leak Chromium processes.
   *
   * @throws Error if a scrape is currently running.
   */
  async initialize(): Promise<void> {
    if (this.isRunning) {
      throw new Error('Scraper is already running');
    }
    console.log('🚀 Initializing Barreau scraper...');

    await this.closeBrowser();

    const browser = await puppeteer.launch({
      headless: true,
      args: [
        '--no-sandbox',
        '--disable-setuid-sandbox',
        '--disable-dev-shm-usage',
        '--disable-accelerated-2d-canvas',
        '--no-first-run',
        '--no-zygote',
        '--disable-gpu'
      ]
    });

    try {
      const page = await browser.newPage();
      // Set user agent to avoid detection
      await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36');
      // Set viewport
      await page.setViewport({ width: 1920, height: 1080 });
      this.browser = browser;
      this.page = page;
    } catch (error) {
      // Best-effort cleanup: the setup error is what the caller needs to see.
      await browser.close().catch(() => undefined);
      throw error;
    }

    console.log('✅ Scraper initialized successfully');
  }

  /**
   * Scrape every page of the directory, importing each lawyer found.
   *
   * Page-level failures are logged and counted, then scraping continues with
   * the next page. Calling `stop()` while this is running interrupts the loop.
   *
   * @throws Error if the scraper is not initialized or is already running;
   *   rethrows any error raised before the page loop starts.
   */
  async scrapeEntireDirectory(): Promise<void> {
    if (!this.browser || !this.page) {
      throw new Error('Scraper not initialized');
    }
    if (this.isRunning) {
      // Two concurrent runs would fight over the single shared page.
      throw new Error('Scraper is already running');
    }
    this.isRunning = true;
    this.progress = {
      totalPages: 0,
      currentPage: 0,
      totalLawyers: 0,
      importedLawyers: 0,
      errors: 0,
      startTime: Date.now()
    };
    try {
      console.log('🔍 Starting Barreau directory scraping...');
      // Navigate to the main directory page
      await this.page.goto('https://www.barreau.qc.ca/fr/trouver-avocat', {
        waitUntil: 'networkidle2',
        timeout: 30000
      });

      const totalPages = await this.getTotalPages();
      this.progress.totalPages = totalPages;
      console.log(`📊 Found ${totalPages} pages to scrape`);

      let interrupted = false;
      for (let pageNum = 1; pageNum <= totalPages; pageNum++) {
        if (this.wasStopped()) {
          interrupted = true;
          break;
        }
        this.progress.currentPage = pageNum;
        console.log(`📄 Scraping page ${pageNum}/${totalPages}`);
        try {
          await this.scrapePage(pageNum);
          // Add delay between pages to be respectful
          await this.delay(2000);
        } catch (error) {
          console.error(`❌ Error scraping page ${pageNum}:`, error);
          this.progress.errors++;
        }
      }

      console.log(interrupted ? '🛑 Directory scraping interrupted' : '🎉 Directory scraping completed!');
      this.printProgressReport();
    } catch (error) {
      console.error('❌ Fatal error during scraping:', error);
      throw error;
    } finally {
      this.isRunning = false;
    }
  }

  /**
   * Work out how many result pages the directory has, from the results counter
   * on the currently loaded page.
   *
   * @returns The page count, or 1 when it cannot be determined.
   */
  private async getTotalPages(): Promise<number> {
    if (!this.page) throw new Error('Page not initialized');
    try {
      // Look for the results count (e.g., "175 résultats" or "29 000 résultats")
      const totalResults: number = await this.page.evaluate(() => {
        const el = document.querySelector('.search-results__count, .results-count, .pagination-info');
        if (el) {
          // Digits may be grouped with (non-breaking) spaces; `\s` matches both.
          const match = el.textContent?.match(/(\d[\d\s]*)[^\d]*résultat/);
          if (match) return parseInt(match[1].replace(/\D/g, ''), 10);
        }
        // Fallback: count the rows on the current page
        return document.querySelectorAll('table tbody tr').length;
      });
      return totalResults ? Math.ceil(totalResults / BarreauScraper.RESULTS_PER_PAGE) : 1;
    } catch (error) {
      console.warn('⚠️ Could not determine total pages, defaulting to 1', error);
      return 1;
    }
  }

  /**
   * Scrape one results page: collect the profile links from the results table
   * and import each profile in turn.
   *
   * Profile-level failures are logged and counted without aborting the page;
   * navigation failures propagate to the caller.
   *
   * @param pageNum 1-based page number passed as the `page` query parameter.
   */
  private async scrapePage(pageNum: number): Promise<void> {
    if (!this.page) throw new Error('Page not initialized');

    const pageUrl = `https://www.barreau.qc.ca/fr/trouver-avocat?page=${pageNum}`;
    await this.page.goto(pageUrl, { waitUntil: 'networkidle2', timeout: 30000 });
    // Wait for the results table to load
    await this.page.waitForSelector('table tbody tr', { timeout: 10000 });

    // Each row: the first link inside a cell points to the profile
    const lawyerLinks: string[] = await this.page.evaluate(() => {
      const rows = Array.from(document.querySelectorAll('table tbody tr'));
      return rows
        .map(row => {
          const link = row.querySelector('td a') as HTMLAnchorElement | null;
          return link ? link.href : null;
        })
        .filter(Boolean);
    });
    console.log(`📋 Found ${lawyerLinks.length} lawyers on page ${pageNum}`);
    this.progress.totalLawyers += lawyerLinks.length;

    for (const lawyerUrl of lawyerLinks) {
      if (this.wasStopped()) break;
      try {
        // Only count profiles that were actually saved, not skipped ones.
        const imported = await this.scrapeLawyerProfile(lawyerUrl);
        if (imported) this.progress.importedLawyers++;
        await this.delay(1000);
      } catch (error) {
        console.error(`❌ Error scraping lawyer profile ${lawyerUrl}:`, error);
        this.progress.errors++;
      }
    }
  }

  /**
   * Scrape one lawyer profile page and persist it.
   *
   * The general fields are read first; the "informations supplémentaires" tab
   * is then clicked and, after a short wait, the supplementary fields are read.
   *
   * @param lawyerUrl Absolute URL of the profile page.
   * @returns `true` when the lawyer was saved, `false` when the profile was
   *   skipped because the name or bar number could not be found.
   */
  private async scrapeLawyerProfile(lawyerUrl: string): Promise<boolean> {
    if (!this.page) throw new Error('Page not initialized');

    await this.page.goto(lawyerUrl, { waitUntil: 'networkidle2', timeout: 30000 });
    // Wait for the profile card to load
    await this.page.waitForSelector('.profile-card, .profile, .result-profile, .container', { timeout: 10000 });

    // --- General Tab ---
    const general = await this.extractProfileFields(
      {
        name: { selector: 'h1, .profile__name, .result-profile__name, .profile-card h1' },
        address: { label: 'Adresse', selector: '.profile__address, .result-profile__address' },
        phone: { label: 'Téléphone', selector: '.profile__phone, .result-profile__phone' },
        email: { label: 'Courriel', selector: 'a[href^="mailto:"], .profile__email, .result-profile__email' },
        barNumber: { label: 'Numéro de membre', selector: '.profile__bar-number, .result-profile__bar-number' },
        year: { label: 'Année de première inscription', selector: '.profile__year, .result-profile__year' }
      },
      {
        domains: { label: 'Domaines de droit', selector: '.profile__domains, .result-profile__domains li' },
        languages: { label: 'Langues parlées', selector: '.profile__languages, .result-profile__languages li' }
      }
    );
    const { name = '', address = '', phone = '', email = '', barNumber = '', year = '' } = general.text;
    const { domains = [], languages = [] } = general.lists;

    if (!name || !barNumber) {
      console.warn(`⚠️ Skipping lawyer with incomplete data: ${lawyerUrl}`);
      return false;
    }

    // --- Supplementary Tab ---
    // Click the tab in its own evaluate so the wait happens BEFORE reading it.
    const tabClicked: boolean = await this.page.evaluate(() => {
      const tab = Array.from(document.querySelectorAll('a, button')).find(el =>
        (el.textContent || '').toLowerCase().includes('informations supplémentaires')
      );
      if (!tab) return false;
      (tab as HTMLElement).click();
      return true;
    });
    if (tabClicked) {
      // `page.waitForTimeout` no longer exists in recent Puppeteer releases.
      await this.delay(500);
    }

    const supplementary = await this.extractProfileFields(
      { section: { label: 'Barreau de section', selector: '.profile__section, .result-profile__section' } },
      { mandates: { label: 'Types de mandats acceptés', selector: '.profile__mandates, .result-profile__mandates li' } }
    );
    const { section = '' } = supplementary.text;
    const { mandates = [] } = supplementary.lists;

    const firstRegistrationYear = parseInt(year, 10);
    await this.processLawyerData({
      name,
      barNumber,
      email,
      phone,
      address,
      specializations: domains,
      regions: [section].filter(Boolean),
      acceptsLegalAid: mandates.some(m => m.toLowerCase().includes('aide juridique')),
      status: 'ACTIVE', // Default, can be improved
      languages,
      bio: '',
      website: '',
      education: '',
      yearsOfExperience: Number.isNaN(firstRegistrationYear)
        ? undefined
        : new Date().getFullYear() - firstRegistrationYear
    });
    console.log(`✅ Imported: ${name} (${barNumber})`);
    return true;
  }

  /**
   * Read a set of text and list fields from the currently loaded page.
   *
   * For each field the value is looked up by its visible label first: the
   * table cell whose text starts with `label` is located and the text of the
   * cell right after it is used. If that fails, `selector` (a standard CSS
   * selector) is used instead. jQuery's `:contains()` is NOT valid in
   * `querySelector`, which is why labels are matched in code.
   *
   * NOTE(review): the "label cell followed by value cell" layout is an
   * assumption about the directory's markup — confirm against the live site.
   *
   * @param textFields Fields resolved to a single trimmed string ('' if absent).
   * @param listFields Fields resolved to a list of non-empty strings.
   */
  private async extractProfileFields(
    textFields: Record<string, { label?: string; selector: string }>,
    listFields: Record<string, { label?: string; selector: string }>
  ): Promise<{ text: Record<string, string>; lists: Record<string, string[]> }> {
    if (!this.page) throw new Error('Page not initialized');
    // The callback runs inside the browser, so it must be fully self-contained.
    return this.page.evaluate(
      (
        textSpecs: Record<string, { label?: string; selector: string }>,
        listSpecs: Record<string, { label?: string; selector: string }>
      ) => {
        const textOf = (el: Element | null): string => (el && el.textContent ? el.textContent.trim() : '');
        const cells = Array.from(document.querySelectorAll('th, td'));

        const valueCellFor = (label?: string): Element | null => {
          if (!label) return null;
          const wanted = label.toLowerCase();
          const matches = cells.filter(cell => textOf(cell).toLowerCase().startsWith(wanted));
          if (matches.length === 0) return null;
          // Prefer the shortest match: an outer layout cell also "starts with" the label.
          const labelCell = matches.reduce((best, cell) =>
            textOf(cell).length < textOf(best).length ? cell : best
          );
          return labelCell.nextElementSibling;
        };

        const text: Record<string, string> = {};
        Object.entries(textSpecs).forEach(([key, spec]) => {
          text[key] = textOf(valueCellFor(spec.label) || document.querySelector(spec.selector));
        });

        const lists: Record<string, string[]> = {};
        Object.entries(listSpecs).forEach(([key, spec]) => {
          const cell = valueCellFor(spec.label);
          let items: string[];
          if (cell) {
            const listItems = Array.from(cell.querySelectorAll('li'));
            items =
              listItems.length > 0
                ? listItems.map(item => textOf(item))
                : textOf(cell).split(/[,;\n]/).map(part => part.trim());
          } else {
            items = Array.from(document.querySelectorAll(spec.selector)).map(el => textOf(el));
          }
          lists[key] = items.filter(Boolean);
        });

        return { text, lists };
      },
      textFields,
      listFields
    );
  }

  /**
   * Create or update the user record for a scraped lawyer.
   *
   * An existing user is matched by bar number, or by email when one was
   * scraped. On update, optional fields that came back empty are left
   * untouched so previously stored values are not wiped.
   *
   * @throws Rethrows any database error after logging it with the lawyer name.
   */
  private async processLawyerData(data: ScrapedLawyerData): Promise<void> {
    try {
      // Never match on an empty email: that could hit an unrelated record.
      const identifiers: Array<{ barNumber: string } | { email: string }> = [{ barNumber: data.barNumber }];
      if (data.email) {
        identifiers.push({ email: data.email });
      }
      const existingLawyer = await prisma.user.findFirst({
        where: { OR: identifiers }
      });

      const language = data.languages && data.languages.length > 0 ? data.languages.join(', ') : undefined;

      if (existingLawyer) {
        // Update existing lawyer; empty scraped values are omitted, not written.
        await prisma.user.update({
          where: { id: existingLawyer.id },
          data: {
            name: data.name,
            email: data.email || existingLawyer.email,
            ...(data.phone ? { phone: data.phone } : {}),
            ...(data.address ? { address: data.address } : {}),
            ...(data.website ? { websiteUrl: data.website } : {}),
            ...(data.bio ? { bio: data.bio } : {}),
            ...(data.education ? { education: data.education } : {}),
            ...(language ? { language } : {}),
            specializations: JSON.stringify(data.specializations),
            regions: JSON.stringify(data.regions),
            acceptsLegalAid: data.acceptsLegalAid,
            barreauStatus: data.status,
            verificationStatus: 'VERIFIED_BARREAU',
            barreauVerifiedAt: new Date(),
            isVerified: true,
            isProfilePublic: true
          }
        });
        console.log(`🔄 Updated existing lawyer: ${data.name}`);
      } else {
        // Create new lawyer profile
        await prisma.user.create({
          data: {
            name: data.name,
            email: data.email || `barreau-${data.barNumber}@placeholder.com`,
            role: 'LAWYER',
            barNumber: data.barNumber,
            phone: data.phone,
            address: data.address,
            websiteUrl: data.website,
            bio: data.bio,
            specializations: JSON.stringify(data.specializations),
            regions: JSON.stringify(data.regions),
            language: language ?? 'en',
            education: data.education,
            acceptsLegalAid: data.acceptsLegalAid,
            barreauStatus: data.status,
            verificationStatus: 'VERIFIED_BARREAU',
            barreauVerifiedAt: new Date(),
            isVerified: true,
            isProfilePublic: true,
            // Unguessable placeholder; the lawyer must reset it to log in
            password: await this.generateTemporaryPassword()
          }
        });
        console.log(`🆕 Created new lawyer: ${data.name}`);
      }
    } catch (error) {
      console.error(`❌ Error processing lawyer data for ${data.name}:`, error);
      throw error;
    }
  }

  /**
   * Produce the bcrypt hash of a random, unguessable temporary password.
   *
   * The plaintext is discarded, so the account can only be accessed after a
   * password reset. A timestamp-based password would be predictable to anyone
   * who knows roughly when the import ran.
   */
  private async generateTemporaryPassword(): Promise<string> {
    const bcrypt = require('bcryptjs');
    const { randomBytes } = require('crypto');
    const tempPassword: string = randomBytes(24).toString('base64');
    return await bcrypt.hash(tempPassword, 12);
  }

  /**
   * Get a snapshot of the current scraping progress.
   *
   * @returns A copy; mutating it does not affect the scraper's own counters.
   */
  getProgress() {
    return { ...this.progress };
  }

  /**
   * Log a summary of the run: duration, pages, lawyers found/imported, errors
   * and success rate ("N/A" when nothing was attempted).
   */
  private printProgressReport(): void {
    const duration = Date.now() - this.progress.startTime;
    const minutes = Math.floor(duration / 60000);
    const seconds = Math.floor((duration % 60000) / 1000);
    const attempted = this.progress.importedLawyers + this.progress.errors;
    // Guard the division: 0 / 0 would print "NaN%".
    const successRate =
      attempted > 0 ? `${((this.progress.importedLawyers / attempted) * 100).toFixed(1)}%` : 'N/A';

    console.log('\n📊 SCRAPING PROGRESS REPORT');
    console.log('============================');
    console.log(`⏱️ Duration: ${minutes}m ${seconds}s`);
    console.log(`📄 Pages scraped: ${this.progress.currentPage}/${this.progress.totalPages}`);
    console.log(`🔎 Lawyers found: ${this.progress.totalLawyers}`);
    console.log(`👥 Lawyers imported: ${this.progress.importedLawyers}`);
    console.log(`❌ Errors: ${this.progress.errors}`);
    console.log(`📈 Success rate: ${successRate}`);
    console.log('============================\n');
  }

  /**
   * Stop the scraper: interrupt any running scrape loop and close the browser.
   */
  async stop(): Promise<void> {
    this.isRunning = false;
    await this.closeBrowser();
    console.log('🛑 Scraper stopped');
  }

  /**
   * Close the browser if one is open. The handles are cleared before closing
   * so a failing `close()` cannot leave stale references behind.
   */
  private async closeBrowser(): Promise<void> {
    const browser = this.browser;
    this.browser = null;
    this.page = null;
    if (browser) {
      await browser.close();
    }
  }

  /** True once `stop()` has been called (or no scrape is in progress). */
  private wasStopped(): boolean {
    return !this.isRunning;
  }

  /**
   * Utility function to add delays.
   *
   * @param ms Delay in milliseconds.
   */
  private delay(ms: number): Promise<void> {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
}
// Shared module-level instance: every importer of `barreauScraper` gets the same
// BarreauScraper object, and therefore the same browser, page and progress state.
export const barreauScraper = new BarreauScraper();
/**
 * Check whether a lawyer with the given name appears in the Barreau public
 * directory search results for the given city.
 *
 * Names are compared case-insensitively, ignoring accents and extra
 * whitespace. A blank name never matches (an empty string is a substring of
 * every result, which would otherwise report a false positive).
 *
 * @param name Lawyer name to look for.
 * @param city City passed to the directory search.
 * @returns `true` if a result containing the name is found; `false` if not,
 *   or if the directory answers with a non-success HTTP status.
 * @throws Propagates network errors raised by `fetch`.
 */
export async function verifyBarreauDirectory(name: string, city: string): Promise<boolean> {
  const normalize = (value: string): string =>
    value
      .toLowerCase()
      .normalize('NFD')
      .replace(/[\u0300-\u036f]/g, '') // strip combining accents: "É" -> "e"
      .replace(/\s+/g, ' ')
      .trim();

  const wantedName = normalize(name);
  if (!wantedName) {
    return false;
  }

  // NOTE(review): this path ("/fr/trouver-un-avocat/") differs from the one the
  // scraper class uses ("/fr/trouver-avocat") — confirm which is current.
  const searchUrl = `https://www.barreau.qc.ca/fr/trouver-un-avocat/?nom=${encodeURIComponent(name)}&ville=${encodeURIComponent(city)}`;
  const res = await fetch(searchUrl, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (compatible; avocat.quebec bot)'
    }
  });
  if (!res.ok) {
    // An error page contains no results; do not parse it as if it were one.
    console.warn(`⚠️ Barreau directory lookup failed with HTTP ${res.status}`);
    return false;
  }

  const html = await res.text();
  const $ = cheerio.load(html);
  // Adapt the selector to the site's actual markup
  const matches = $('.resultats .nom').filter((_index, el) => {
    return normalize($(el).text()).includes(wantedName);
  });
  return matches.length > 0;
}