From 6153b70c1e2e4e03f8e8f84aed4b977ecf86e8d0 Mon Sep 17 00:00:00 2001 From: Ken Yasue Date: Mon, 24 Mar 2025 20:24:30 +0100 Subject: [PATCH] new way of scraping --- data/contact_info.csv | 13 +++++- src/index.ts | 96 +++++++++++-------------------------------- src/lib/UIActions.ts | 88 +++++++++++++++++++++++++++++++++++++-- src/lib/utils.ts | 7 +++- 4 files changed, 127 insertions(+), 77 deletions(-) diff --git a/data/contact_info.csv b/data/contact_info.csv index 37e0406..ef64ad6 100644 --- a/data/contact_info.csv +++ b/data/contact_info.csv @@ -2798,4 +2798,15 @@ City,Website URL,Email "Bangalore,"null","maya5622@gmail.com" "Bangalore,"https://map-india.org/","hello@map-india.org" "Bangalore,"http://www.gitagged.com/","care@gitagged.com" -"Bangalore,"http://map-india.org/contact-us/","hello@map-india.org" \ No newline at end of file +"Bangalore,"http://map-india.org/contact-us/","hello@map-india.org" +"Moscow,"http://www.kreml.ru/en/","head@kremlin.museum.ru" +"Moscow,"http://www.mgomz.ru/","null" +"Moscow,"http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru" +"Moscow,"http://www.kreml.ru/en/","head@kremlin.museum.ru" +"Moscow,"http://www.mgomz.ru/","null" +"Moscow,"http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru" +"Moscow,"https://vk.com/tsaritsynomuseum","info@tsaritsyno.net" +"Moscow,"http://kosmo-museum.ru/","editormmk@gmail.com" +"Moscow,"http://www.kreml.ru/","null" +"Moscow,"null","null" +"Moscow,"http://www.bunker42.com/","zakaz@bunker42.com" diff --git a/src/index.ts b/src/index.ts index 3ebf5e9..c611375 100644 --- a/src/index.ts +++ b/src/index.ts @@ -30,10 +30,6 @@ async function visitCityPages(): Promise { for (let i = 0; i < cities.length; i++) { const city = cities[i]; console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`); - let originalWindow; - let cityTopWindow; - let attactionsWindow; - let museumWindow; try { @@ -44,79 +40,19 @@ async function visitCityPages(): Promise { await WebDriverUtils.wait(driver); console.log("Exec Search") - if (!await UIActions.execSearch(driver, city)) throw `${city} failed`; - await WebDriverUtils.wait(driver); - - console.log("Click See all") - if (!await UIActions.clickSeeAllAttractions(driver)) { - if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`; - if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`; - } - await WebDriverUtils.wait(); - - console.log("Switch tab") - let windows = await driver.getAllWindowHandles(); - // Switch to the newly opened window/tab - for (const handle of windows) { - if (handle !== originalWindow) { - cityTopWindow = handle; - await driver.switchTo().window(handle); - } - } - - console.log("Click See all attractions") - if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`; - await WebDriverUtils.wait(); - - console.log("Switch tab to Attraction") - windows = await driver.getAllWindowHandles(); - // Switch to the newly opened window/tab - for (const handle of windows) { - if (handle !== originalWindow && handle !== cityTopWindow) { - attactionsWindow = handle; - await driver.switchTo().window(attactionsWindow); - } - } - - // click museum - console.log("Click Museum link"); - if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`; + if (!await UIActions.execSearch(driver, `${city} museum`)) throw `${city} failed`; await WebDriverUtils.wait(driver); let page = 1; + let museumNames: string[] = []; + while (1) { // get list of museums console.log("Get list of museums"); - const museumElms = await UIActions.getMusiums(driver); + const tmpMuseumNames = await UIActions.getAttractionNames(driver); await WebDriverUtils.wait(driver); - - for (const listItem of museumElms) { - - await listItem.click(); - await WebDriverUtils.wait(driver); - - windows = await driver.getAllWindowHandles(); - for (const handle of windows) { - if (handle !== originalWindow && handle !== cityTopWindow && handle !== attactionsWindow) { - museumWindow = handle; - await driver.switchTo().window(museumWindow); - } - } - - const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver); - - console.log(`${websiteUrl} / ${email}`); - saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv')); - - museumWindow && await driver.switchTo().window(museumWindow); - await driver.close(); - await WebDriverUtils.wait(driver); - - attactionsWindow && await driver.switchTo().window(attactionsWindow); - await WebDriverUtils.wait(driver); - - } + museumNames = [...museumNames, ...tmpMuseumNames] page++; @@ -127,7 +63,26 @@ async function visitCityPages(): Promise { } + for (const museumName of museumNames) { + console.log("back to home"); + if (!await UIActions.gotoHome(driver)) throw `${city} failed`; + await WebDriverUtils.wait(driver); + + if (!await UIActions.typeSearch(driver, `${city} ${museumName}`)) throw `${city} failed`; + await WebDriverUtils.wait(driver); + + if (!await UIActions.clickFirstAttractionLinkInForm(driver)) throw `${city} failed`; + await WebDriverUtils.wait(driver); + + const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver); + + console.log(`${websiteUrl} / ${email}`); + saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv')); + + } + + console.log(museumNames); await UIActions.closeAllTabsExceptFirst(driver); @@ -135,9 +90,8 @@ async function visitCityPages(): Promise { console.log(`Waiting for 5000 seconds before next city...`); await WebDriverUtils.wait(); // Wait 5000 seconds before next city } - } catch (error) { - await UIActions.closeAllTabsExceptFirst(driver); + } catch (error) { // If the button is not found within the timeout, log and continue to the next city console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`); diff --git a/src/lib/UIActions.ts b/src/lib/UIActions.ts index 69f756a..61860d2 100644 --- a/src/lib/UIActions.ts +++ b/src/lib/UIActions.ts @@ -8,7 +8,7 @@ import { ContactInfo } from './types'; -export async function execSearch(driver: WebDriver, city: string): Promise { +export async function execSearch(driver: WebDriver, searchTerm: string): Promise { try { // Find the search input field const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]'; @@ -18,7 +18,7 @@ export async function execSearch(driver: WebDriver, city: string): Promise { + try { + // Find the search input field + const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]'; + await WebDriverUtils.waitForElement(driver, searchSelector, 10000); + console.log("Search box found"); + const searchInput = await driver.findElement(By.css(searchSelector)); + + // Clear any existing text and enter the city name + await searchInput.clear(); + await searchInput.sendKeys(searchTerm); + + return true; + } catch (e) { + return false; + } +} + + export async function clickSeeAll(driver: WebDriver): Promise { try { const seeAllElement = await driver.wait( @@ -161,6 +180,49 @@ export async function getMusiums(driver: WebDriver): Promise { } } +export async function getMuseumsLinks(driver: WebDriver): Promise { + const xpath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`; + + try { + const links = await driver.findElements(By.xpath(xpath)); + + const urls: string[] = []; + for (const link of links) { + const href = await link.getAttribute('href'); + if (href) { + urls.push(href); + } + } + + return urls; + } catch (error) { + console.warn('Error getting attraction URLs:', error); + return []; + } +} + +export async function getAttractionNames(driver: WebDriver): Promise { + // XPath to find

inside whose href starts with /Attraction_Review + const xpath = `//a[starts-with(@href, '/Attraction_Review')]/h3`; + + try { + const h3Elements = await driver.findElements(By.xpath(xpath)); + + const names: string[] = []; + for (const h3 of h3Elements) { + const text = await h3.getText(); + if (text) { + names.push(text.trim().replace(/^\d+\.\s*/, '')); + } + } + + return names; + } catch (error) { + console.warn('Error getting attraction names:', error); + return []; + } +} + export async function getWebsiteAndEmail(driver: WebDriver): Promise { const result: ContactInfo = { websiteUrl: null, email: null }; @@ -224,4 +286,24 @@ export async function closeAllTabsExceptFirst(driver: WebDriver): Promise await driver.switchTo().window(originalHandle); console.log(`Switched back to original tab: ${originalHandle}`); -} \ No newline at end of file +} + +export async function clickFirstAttractionLinkInForm(driver: WebDriver): Promise { + const xpath = `//form//a[starts-with(@href, '/Attraction')]`; + + try { + // Wait for the link to appear inside a form + const link = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000); + + await driver.wait(until.elementIsVisible(link), 5000); + await driver.wait(until.elementIsEnabled(link), 5000); + + await link.click(); + console.log('Clicked the first attraction link in the form.'); + + return true; + } catch (error) { + console.warn('Attraction link not found or not clickable.', error); + return false; + } +} diff --git a/src/lib/utils.ts b/src/lib/utils.ts index b506846..bdf6ca9 100644 --- a/src/lib/utils.ts +++ b/src/lib/utils.ts @@ -23,6 +23,8 @@ export class WebDriverUtils { let scrollCounter = 0; while (Date.now() < endTime) { + + /* try { if (driver) { await driver.executeScript(` @@ -34,6 +36,7 @@ export class WebDriverUtils { } catch (error) { console.warn('Scroll failed:', error); } + */ // Wait a little between scrolls await new Promise(resolve => setTimeout(resolve, 500)); @@ -105,7 +108,7 @@ export async function disableCookiesInChrome(): Promise { } -export async function useChrome(): Promise { +export async function useChrome(initialUrl: string = "https://www.tripadvisor.com/"): Promise { // Set Chrome options const options = new chrome.Options(); @@ -127,7 +130,7 @@ export async function useChrome(): Promise { .setChromeOptions(options) .build(); - await driver.get('https://www.tripadvisor.com/'); + await driver.get(initialUrl); console.log('Chrome launched with cookies disabled.');