2 Commits

Author SHA1 Message Date
6153b70c1e new way of scraping 2025-03-24 20:24:30 +01:00
47134525b8 save work 2025-03-24 19:00:26 +01:00
4 changed files with 137 additions and 97 deletions

View File

@ -2799,3 +2799,14 @@ City,Website URL,Email
"Bangalore,"https://map-india.org/","hello@map-india.org" "Bangalore,"https://map-india.org/","hello@map-india.org"
"Bangalore,"http://www.gitagged.com/","care@gitagged.com" "Bangalore,"http://www.gitagged.com/","care@gitagged.com"
"Bangalore,"http://map-india.org/contact-us/","hello@map-india.org" "Bangalore,"http://map-india.org/contact-us/","hello@map-india.org"
"Moscow,"http://www.kreml.ru/en/","head@kremlin.museum.ru"
"Moscow,"http://www.mgomz.ru/","null"
"Moscow,"http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru"
"Moscow,"http://www.kreml.ru/en/","head@kremlin.museum.ru"
"Moscow,"http://www.mgomz.ru/","null"
"Moscow,"http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru"
"Moscow,"https://vk.com/tsaritsynomuseum","info@tsaritsyno.net"
"Moscow,"http://kosmo-museum.ru/","editormmk@gmail.com"
"Moscow,"http://www.kreml.ru/","null"
"Moscow,"null","null"
"Moscow,"http://www.bunker42.com/","zakaz@bunker42.com"

Can't render this file because it contains an unexpected character in line 2 and column 8.

View File

@ -23,17 +23,13 @@ async function visitCityPages(): Promise<void> {
console.log('Connecting to existing Chrome browser...'); console.log('Connecting to existing Chrome browser...');
const driver = await useChrome(); const driver = await useExistingChrome();
if (!driver) return; if (!driver) return;
// Visit each city's TripAdvisor page // Visit each city's TripAdvisor page
for (let i = 0; i < cities.length; i++) { for (let i = 0; i < cities.length; i++) {
const city = cities[i]; const city = cities[i];
console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`); console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`);
let originalWindow;
let cityTopWindow;
let attactionsWindow;
let museumWindow;
try { try {
@ -44,83 +40,19 @@ async function visitCityPages(): Promise<void> {
await WebDriverUtils.wait(driver); await WebDriverUtils.wait(driver);
console.log("Exec Search") console.log("Exec Search")
if (!await UIActions.execSearch(driver, city)) throw `${city} failed`; if (!await UIActions.execSearch(driver, `${city} museum`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
console.log("Click See all")
let seeAllUrl = await UIActions.getSeeAllUrl(driver);
if (seeAllUrl.length == 0) {
if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`;
seeAllUrl = await UIActions.getSeeAllUrl(driver);
}
if (seeAllUrl.length == 0) throw `${city} failed`;
await WebDriverUtils.wait();
console.log("Switch tab")
let windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow) {
cityTopWindow = handle;
await driver.switchTo().window(handle);
}
}
console.log("Click See all attractions")
if (!await UIActions.getSeeAllAttractionsUrl(driver)) throw `${city} failed`;
await WebDriverUtils.wait();
console.log("Switch tab to Attraction")
windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow) {
attactionsWindow = handle;
await driver.switchTo().window(attactionsWindow);
}
}
// click museum
console.log("Click Museum link");
if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver); await WebDriverUtils.wait(driver);
let page = 1; let page = 1;
let museumNames: string[] = [];
while (1) { while (1) {
// get list of museums // get list of museums
console.log("Get list of museums"); console.log("Get list of museums");
const museumElms = await UIActions.getMusiums(driver); const tmpMuseumNames = await UIActions.getAttractionNames(driver);
await WebDriverUtils.wait(driver); await WebDriverUtils.wait(driver);
museumNames = [...museumNames, ...tmpMuseumNames]
for (const listItem of museumElms) {
await listItem.click();
await WebDriverUtils.wait(driver);
windows = await driver.getAllWindowHandles();
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow && handle !== attactionsWindow) {
museumWindow = handle;
await driver.switchTo().window(museumWindow);
}
}
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
museumWindow && await driver.switchTo().window(museumWindow);
await driver.close();
await WebDriverUtils.wait(driver);
attactionsWindow && await driver.switchTo().window(attactionsWindow);
await WebDriverUtils.wait(driver);
}
page++; page++;
@ -131,7 +63,26 @@ async function visitCityPages(): Promise<void> {
} }
for (const museumName of museumNames) {
console.log("back to home");
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
if (!await UIActions.typeSearch(driver, `${city} ${museumName}`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
if (!await UIActions.clickFirstAttractionLinkInForm(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
}
console.log(museumNames);
await UIActions.closeAllTabsExceptFirst(driver); await UIActions.closeAllTabsExceptFirst(driver);
@ -139,9 +90,8 @@ async function visitCityPages(): Promise<void> {
console.log(`Waiting for 5000 seconds before next city...`); console.log(`Waiting for 5000 seconds before next city...`);
await WebDriverUtils.wait(); // Wait 5000 seconds before next city await WebDriverUtils.wait(); // Wait 5000 seconds before next city
} }
} catch (error) {
await UIActions.closeAllTabsExceptFirst(driver); } catch (error) {
// If the button is not found within the timeout, log and continue to the next city // If the button is not found within the timeout, log and continue to the next city
console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`); console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`);

View File

@ -8,7 +8,7 @@ import { ContactInfo } from './types';
export async function execSearch(driver: WebDriver, city: string): Promise<boolean> { export async function execSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
try { try {
// Find the search input field // Find the search input field
const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]'; const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
@ -18,7 +18,7 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
// Clear any existing text and enter the city name // Clear any existing text and enter the city name
await searchInput.clear(); await searchInput.clear();
await searchInput.sendKeys(city); await searchInput.sendKeys(searchTerm);
// Submit the search (press Enter) // Submit the search (press Enter)
await WebDriverUtils.wait(driver); await WebDriverUtils.wait(driver);
@ -31,6 +31,25 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
} }
} }
/**
 * Types a search term into the search box without submitting it.
 *
 * Waits for the search input via `WebDriverUtils.waitForElement` (timeout
 * argument 10000 — presumably milliseconds; confirm against the helper),
 * clears the field and sends `searchTerm` as keystrokes. No Enter key is sent,
 * so the caller decides what to do with any suggestions that appear.
 *
 * @param driver - Selenium WebDriver session to operate on.
 * @param searchTerm - Text to type into the search box.
 * @returns `true` once the text has been typed; `false` if any step threw.
 */
export async function typeSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
  // Find the search input field
  const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
  try {
    await WebDriverUtils.waitForElement(driver, searchSelector, 10000);
    console.log("Search box found");
    const searchInput = await driver.findElement(By.css(searchSelector));

    // Clear any existing text and enter the search term
    await searchInput.clear();
    await searchInput.sendKeys(searchTerm);
    return true;
  } catch (e) {
    // Report the cause instead of discarding it: callers only see `false`
    // and would otherwise have no way to tell why typing failed.
    console.warn(`Failed to type search term "${searchTerm}":`, e);
    return false;
  }
}
export async function clickSeeAll(driver: WebDriver): Promise<boolean> { export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
try { try {
const seeAllElement = await driver.wait( const seeAllElement = await driver.wait(
@ -80,7 +99,7 @@ export async function gotoHome(driver: WebDriver): Promise<boolean> {
} }
export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean> { export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean> {
const xpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`; const xpath = `//h3[starts-with(normalize-space(.), 'Things to do')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
try { try {
const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000); const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000);
@ -98,7 +117,7 @@ export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean
} }
export async function getSeeAllAttractionsUrl(driver: WebDriver): Promise<string | null> { export async function getSeeAllAttractionsUrl(driver: WebDriver): Promise<string | null> {
const xpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`; const xpath = `//h3[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
try { try {
const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000); const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000);
@ -161,6 +180,49 @@ export async function getMusiums(driver: WebDriver): Promise<WebElement[]> {
} }
} }
/**
 * Collects the `href` of every attraction link matched on the current page.
 *
 * The XPath selects anchors whose href starts with `/Attraction` and that
 * contain an image, located inside a `<section>` that also holds an `<h3>`.
 *
 * @param driver - Selenium WebDriver session to query.
 * @returns The non-empty href values in document order, or an empty array if
 *          anything throws while reading them.
 */
export async function getMuseumsLinks(driver: WebDriver): Promise<string[]> {
  const xpath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`;

  try {
    const anchors = await driver.findElements(By.xpath(xpath));
    const collected: string[] = [];

    // Read the attributes one at a time, in order.
    for (const anchor of anchors) {
      const target = await anchor.getAttribute('href');
      if (!target) {
        // Skip anchors with a missing or empty href.
        continue;
      }
      collected.push(target);
    }

    return collected;
  } catch (error) {
    // A failure part-way through discards partial results on purpose.
    console.warn('Error getting attraction URLs:', error);
    return [];
  }
}
/**
 * Reads the attraction names listed on the current page.
 *
 * Each name comes from an `<h3>` that is a direct child of an `<a>` whose href
 * starts with `/Attraction_Review`. A leading rank prefix such as `"12. "` is
 * stripped, and entries that are empty after cleaning are skipped.
 *
 * @param driver - Selenium WebDriver session to query.
 * @returns The cleaned names in document order, or an empty array if anything
 *          throws while reading them.
 */
export async function getAttractionNames(driver: WebDriver): Promise<string[]> {
  // XPath to find <h3> directly under <a> whose href starts with /Attraction_Review
  const xpath = `//a[starts-with(@href, '/Attraction_Review')]/h3`;

  try {
    const h3Elements = await driver.findElements(By.xpath(xpath));
    const names: string[] = [];

    for (const h3 of h3Elements) {
      const text = await h3.getText();
      // Clean first, then test for emptiness: whitespace-only text or a bare
      // rank such as "3." would otherwise be pushed as an empty name.
      const name = (text ?? '').trim().replace(/^\d+\.\s*/, '');
      if (name) {
        names.push(name);
      }
    }

    return names;
  } catch (error) {
    console.warn('Error getting attraction names:', error);
    return [];
  }
}
export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> { export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> {
const result: ContactInfo = { websiteUrl: null, email: null }; const result: ContactInfo = { websiteUrl: null, email: null };
@ -225,3 +287,23 @@ export async function closeAllTabsExceptFirst(driver: WebDriver): Promise<void>
await driver.switchTo().window(originalHandle); await driver.switchTo().window(originalHandle);
console.log(`Switched back to original tab: ${originalHandle}`); console.log(`Switched back to original tab: ${originalHandle}`);
} }
/**
 * Clicks the first anchor inside a `<form>` whose href starts with `/Attraction`.
 *
 * Waits for the link to be located, then visible, then enabled, with a
 * 5000 timeout passed to each `driver.wait` call, before clicking it.
 *
 * @param driver - Selenium WebDriver session to operate on.
 * @returns `true` if the link was clicked; `false` if any wait or the click threw.
 */
export async function clickFirstAttractionLinkInForm(driver: WebDriver): Promise<boolean> {
  const timeout = 5000;

  try {
    const locator = By.xpath(`//form//a[starts-with(@href, '/Attraction')]`);

    // Located -> visible -> enabled, each with its own timeout
    const anchor = await driver.wait(until.elementLocated(locator), timeout);
    await driver.wait(until.elementIsVisible(anchor), timeout);
    await driver.wait(until.elementIsEnabled(anchor), timeout);

    await anchor.click();
    console.log('Clicked the first attraction link in the form.');
    return true;
  } catch (error) {
    console.warn('Attraction link not found or not clickable.', error);
    return false;
  }
}

View File

@ -23,23 +23,20 @@ export class WebDriverUtils {
let scrollCounter = 0; let scrollCounter = 0;
while (Date.now() < endTime) { while (Date.now() < endTime) {
/*
try { try {
if(driver){ if (driver) {
if(scrollCounter < 4){ await driver.executeScript(`
await driver.executeScript(` window.scrollBy(0, 10);
window.scrollBy(0, window.innerHeight); `);
`);
}else{
await driver.executeScript(`
window.scrollTo(0, 0);
`);
}
scrollCounter++; scrollCounter++;
} }
} catch (error) { } catch (error) {
console.warn('Scroll failed:', error); console.warn('Scroll failed:', error);
} }
*/
// Wait a little between scrolls // Wait a little between scrolls
await new Promise(resolve => setTimeout(resolve, 500)); await new Promise(resolve => setTimeout(resolve, 500));
@ -111,7 +108,7 @@ export async function disableCookiesInChrome(): Promise<WebDriver | null> {
} }
export async function useChrome(): Promise<WebDriver | null> { export async function useChrome(initialUrl: string = "https://www.tripadvisor.com/"): Promise<WebDriver | null> {
// Set Chrome options // Set Chrome options
const options = new chrome.Options(); const options = new chrome.Options();
@ -133,7 +130,7 @@ export async function useChrome(): Promise<WebDriver | null> {
.setChromeOptions(options) .setChromeOptions(options)
.build(); .build();
await driver.get('https://www.tripadvisor.com/'); await driver.get(initialUrl);
console.log('Chrome launched with cookies disabled.'); console.log('Chrome launched with cookies disabled.');