new way of scraping
This commit is contained in:
@ -2798,4 +2798,15 @@ City,Website URL,Email
|
||||
"Bangalore","null","maya5622@gmail.com"
|
||||
"Bangalore","https://map-india.org/","hello@map-india.org"
|
||||
"Bangalore","http://www.gitagged.com/","care@gitagged.com"
|
||||
"Bangalore","http://map-india.org/contact-us/","hello@map-india.org"
|
||||
"Bangalore","http://map-india.org/contact-us/","hello@map-india.org"
|
||||
"Moscow","http://www.kreml.ru/en/","head@kremlin.museum.ru"
|
||||
"Moscow","http://www.mgomz.ru/","null"
|
||||
"Moscow","http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru"
|
||||
"Moscow","http://www.kreml.ru/en/","head@kremlin.museum.ru"
|
||||
"Moscow","http://www.mgomz.ru/","null"
|
||||
"Moscow","http://www.tretyakovgallery.ru/about/history/zdanie-v-lavrushinskom-pereulke/","tretyakov@tretyakov.ru"
|
||||
"Moscow","https://vk.com/tsaritsynomuseum","info@tsaritsyno.net"
|
||||
"Moscow","http://kosmo-museum.ru/","editormmk@gmail.com"
|
||||
"Moscow","http://www.kreml.ru/","null"
|
||||
"Moscow","null","null"
|
||||
"Moscow","http://www.bunker42.com/","zakaz@bunker42.com"
|
||||
|
||||
|
Can't render this file because it contains an unexpected character in line 2 and column 8.
|
96
src/index.ts
96
src/index.ts
@ -30,10 +30,6 @@ async function visitCityPages(): Promise<void> {
|
||||
for (let i = 0; i < cities.length; i++) {
|
||||
const city = cities[i];
|
||||
console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`);
|
||||
let originalWindow;
|
||||
let cityTopWindow;
|
||||
let attactionsWindow;
|
||||
let museumWindow;
|
||||
|
||||
try {
|
||||
|
||||
@ -44,79 +40,19 @@ async function visitCityPages(): Promise<void> {
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
console.log("Exec Search")
|
||||
if (!await UIActions.execSearch(driver, city)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
console.log("Click See all")
|
||||
if (!await UIActions.clickSeeAllAttractions(driver)) {
|
||||
if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`;
|
||||
if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`;
|
||||
}
|
||||
await WebDriverUtils.wait();
|
||||
|
||||
console.log("Switch tab")
|
||||
let windows = await driver.getAllWindowHandles();
|
||||
// Switch to the newly opened window/tab
|
||||
for (const handle of windows) {
|
||||
if (handle !== originalWindow) {
|
||||
cityTopWindow = handle;
|
||||
await driver.switchTo().window(handle);
|
||||
}
|
||||
}
|
||||
|
||||
console.log("Click See all attractions")
|
||||
if (!await UIActions.clickSeeAllAttractions(driver)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait();
|
||||
|
||||
console.log("Switch tab to Attraction")
|
||||
windows = await driver.getAllWindowHandles();
|
||||
// Switch to the newly opened window/tab
|
||||
for (const handle of windows) {
|
||||
if (handle !== originalWindow && handle !== cityTopWindow) {
|
||||
attactionsWindow = handle;
|
||||
await driver.switchTo().window(attactionsWindow);
|
||||
}
|
||||
}
|
||||
|
||||
// click museum
|
||||
console.log("Click Museum link");
|
||||
if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`;
|
||||
if (!await UIActions.execSearch(driver, `${city} museum`)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
let page = 1;
|
||||
let museumNames: string[] = [];
|
||||
|
||||
while (1) {
|
||||
|
||||
// get list of museums
|
||||
console.log("Get list of museums");
|
||||
const museumElms = await UIActions.getMusiums(driver);
|
||||
const tmpMuseumNames = await UIActions.getAttractionNames(driver);
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
for (const listItem of museumElms) {
|
||||
|
||||
await listItem.click();
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
windows = await driver.getAllWindowHandles();
|
||||
for (const handle of windows) {
|
||||
if (handle !== originalWindow && handle !== cityTopWindow && handle !== attactionsWindow) {
|
||||
museumWindow = handle;
|
||||
await driver.switchTo().window(museumWindow);
|
||||
}
|
||||
}
|
||||
|
||||
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
|
||||
|
||||
console.log(`${websiteUrl} / ${email}`);
|
||||
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
|
||||
|
||||
museumWindow && await driver.switchTo().window(museumWindow);
|
||||
await driver.close();
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
attactionsWindow && await driver.switchTo().window(attactionsWindow);
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
}
|
||||
museumNames = [...museumNames, ...tmpMuseumNames]
|
||||
|
||||
page++;
|
||||
|
||||
@ -127,7 +63,26 @@ async function visitCityPages(): Promise<void> {
|
||||
|
||||
}
|
||||
|
||||
for (const museumName of museumNames) {
|
||||
|
||||
console.log("back to home");
|
||||
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
if (!await UIActions.typeSearch(driver, `${city} ${museumName}`)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
if (!await UIActions.clickFirstAttractionLinkInForm(driver)) throw `${city} failed`;
|
||||
await WebDriverUtils.wait(driver);
|
||||
|
||||
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
|
||||
|
||||
console.log(`${websiteUrl} / ${email}`);
|
||||
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
|
||||
|
||||
}
|
||||
|
||||
console.log(museumNames);
|
||||
await UIActions.closeAllTabsExceptFirst(driver);
|
||||
|
||||
|
||||
@ -135,9 +90,8 @@ async function visitCityPages(): Promise<void> {
|
||||
console.log(`Waiting for 5000 seconds before next city...`);
|
||||
await WebDriverUtils.wait(); // Wait 5000 seconds before next city
|
||||
}
|
||||
} catch (error) {
|
||||
|
||||
await UIActions.closeAllTabsExceptFirst(driver);
|
||||
} catch (error) {
|
||||
|
||||
// If the button is not found within the timeout, log and continue to the next city
|
||||
console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`);
|
||||
|
||||
@ -8,7 +8,7 @@ import { ContactInfo } from './types';
|
||||
|
||||
|
||||
|
||||
export async function execSearch(driver: WebDriver, city: string): Promise<boolean> {
|
||||
export async function execSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
|
||||
try {
|
||||
// Find the search input field
|
||||
const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
|
||||
@ -18,7 +18,7 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
|
||||
|
||||
// Clear any existing text and enter the search term
|
||||
await searchInput.clear();
|
||||
await searchInput.sendKeys(city);
|
||||
await searchInput.sendKeys(searchTerm);
|
||||
|
||||
// Submit the search (press Enter)
|
||||
await WebDriverUtils.wait(driver);
|
||||
@ -31,6 +31,25 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Types a search term into the site's search box without submitting it.
 *
 * The input is located via a CSS selector, cleared, and then filled with
 * `searchTerm`. No Enter key is sent here; the caller is expected to act on
 * whatever the page shows after typing.
 *
 * @param driver - WebDriver session whose current page contains the search box.
 * @param searchTerm - Text to type into the search box.
 * @returns `true` when the text was typed; `false` when any step failed
 *   (the failure is logged as a warning rather than thrown).
 */
export async function typeSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
    try {
        // Find the search input field
        const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
        // NOTE(review): 10000 is presumably a timeout in milliseconds — confirm against WebDriverUtils.waitForElement.
        await WebDriverUtils.waitForElement(driver, searchSelector, 10000);
        console.log("Search box found");
        const searchInput = await driver.findElement(By.css(searchSelector));

        // Clear any existing text and enter the search term
        await searchInput.clear();
        await searchInput.sendKeys(searchTerm);

        return true;
    } catch (error) {
        // Report the cause instead of swallowing it, so a `false` result can be diagnosed.
        console.warn(`Failed to type search term "${searchTerm}":`, error);
        return false;
    }
}
|
||||
|
||||
|
||||
export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
|
||||
try {
|
||||
const seeAllElement = await driver.wait(
|
||||
@ -161,6 +180,49 @@ export async function getMusiums(driver: WebDriver): Promise<WebElement[]> {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Gathers the `href` values of attraction links on the current page.
 *
 * The XPath matches anchors whose href starts with `/Attraction` and that
 * contain an image, located inside a `section` that also holds an `h3` and
 * an attraction link.
 *
 * @param driver - WebDriver session positioned on the listing page.
 * @returns The non-empty hrefs in document order, or an empty array when the
 *   lookup fails (the error is logged as a warning).
 */
export async function getMuseumsLinks(driver: WebDriver): Promise<string[]> {
    const attractionLinkXPath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`;

    try {
        const anchors = await driver.findElements(By.xpath(attractionLinkXPath));
        const collected: string[] = [];

        // Attributes are read one element at a time, in document order.
        for (const anchor of anchors) {
            const target = await anchor.getAttribute('href');
            if (!target) {
                continue; // skip anchors without a usable href
            }
            collected.push(target);
        }

        return collected;
    } catch (error) {
        console.warn('Error getting attraction URLs:', error);
        return [];
    }
}
|
||||
|
||||
/**
 * Reads the attraction names shown on the current listing page.
 *
 * Names are taken from `<h3>` elements that are direct children of anchors
 * whose href starts with `/Attraction_Review`. Each heading is trimmed and a
 * leading ranking prefix such as `"12. "` is removed.
 *
 * Headings that are empty after this normalisation (whitespace only, or just
 * a ranking number) are skipped, so callers never receive an empty name.
 *
 * @param driver - WebDriver session positioned on the listing page.
 * @returns The cleaned names in document order, or an empty array when the
 *   lookup fails (the error is logged as a warning).
 */
export async function getAttractionNames(driver: WebDriver): Promise<string[]> {
    // XPath to find <h3> inside <a> whose href starts with /Attraction_Review
    const xpath = `//a[starts-with(@href, '/Attraction_Review')]/h3`;
    // Matches a leading ranking prefix like "3." plus any following whitespace.
    const rankingPrefix = /^\d+\.\s*/;

    try {
        const h3Elements = await driver.findElements(By.xpath(xpath));

        const names: string[] = [];
        for (const h3 of h3Elements) {
            const text = await h3.getText();
            if (!text) {
                continue;
            }

            // Check emptiness AFTER normalising: the raw text may be only
            // whitespace or only the ranking number.
            const name = text.trim().replace(rankingPrefix, '');
            if (name) {
                names.push(name);
            }
        }

        return names;
    } catch (error) {
        console.warn('Error getting attraction names:', error);
        return [];
    }
}
|
||||
|
||||
export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> {
|
||||
const result: ContactInfo = { websiteUrl: null, email: null };
|
||||
|
||||
@ -224,4 +286,24 @@ export async function closeAllTabsExceptFirst(driver: WebDriver): Promise<void>
|
||||
|
||||
await driver.switchTo().window(originalHandle);
|
||||
console.log(`Switched back to original tab: ${originalHandle}`);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Clicks the first attraction link found inside a `<form>` on the current page.
 *
 * The link is an anchor whose href starts with `/Attraction`. The function
 * waits for it to be located, then visible, then enabled, before clicking.
 *
 * @param driver - WebDriver session showing the form.
 * @returns `true` once the link has been clicked; `false` when it could not be
 *   found or clicked in time (the error is logged as a warning).
 */
export async function clickFirstAttractionLinkInForm(driver: WebDriver): Promise<boolean> {
    const attractionInFormXPath = `//form//a[starts-with(@href, '/Attraction')]`;
    // The same wait budget is applied to each individual wait below.
    const stepTimeout = 5000;

    try {
        // Block until the anchor exists inside a form
        const anchor = await driver.wait(
            until.elementLocated(By.xpath(attractionInFormXPath)),
            stepTimeout,
        );

        // Then require it to be visible and, after that, enabled.
        const readiness = [until.elementIsVisible(anchor), until.elementIsEnabled(anchor)];
        for (const condition of readiness) {
            await driver.wait(condition, stepTimeout);
        }

        await anchor.click();
        console.log('Clicked the first attraction link in the form.');
        return true;
    } catch (error) {
        console.warn('Attraction link not found or not clickable.', error);
        return false;
    }
}
|
||||
|
||||
@ -23,6 +23,8 @@ export class WebDriverUtils {
|
||||
let scrollCounter = 0;
|
||||
|
||||
while (Date.now() < endTime) {
|
||||
|
||||
/*
|
||||
try {
|
||||
if (driver) {
|
||||
await driver.executeScript(`
|
||||
@ -34,6 +36,7 @@ export class WebDriverUtils {
|
||||
} catch (error) {
|
||||
console.warn('Scroll failed:', error);
|
||||
}
|
||||
*/
|
||||
|
||||
// Wait a little between scrolls
|
||||
await new Promise(resolve => setTimeout(resolve, 500));
|
||||
@ -105,7 +108,7 @@ export async function disableCookiesInChrome(): Promise<WebDriver | null> {
|
||||
|
||||
}
|
||||
|
||||
export async function useChrome(): Promise<WebDriver | null> {
|
||||
export async function useChrome(initialUrl: string = "https://www.tripadvisor.com/"): Promise<WebDriver | null> {
|
||||
// Set Chrome options
|
||||
const options = new chrome.Options();
|
||||
|
||||
@ -127,7 +130,7 @@ export async function useChrome(): Promise<WebDriver | null> {
|
||||
.setChromeOptions(options)
|
||||
.build();
|
||||
|
||||
await driver.get('https://www.tripadvisor.com/');
|
||||
await driver.get(initialUrl);
|
||||
|
||||
console.log('Chrome launched with cookies disabled.');
|
||||
|
||||
|
||||
Reference in New Issue
Block a user