Compare commits

4 Commits

Author SHA1 Message Date
af7e55882d wip 2025-03-25 11:24:17 +01:00
3d566a408e add name to contact info 2025-03-25 06:31:19 +01:00
6153b70c1e new way of scraping 2025-03-24 20:24:30 +01:00
47134525b8 save work 2025-03-24 19:00:26 +01:00
6 changed files with 1086 additions and 2912 deletions

View File

@ -1,4 +1,20 @@
rank,Latitude,Longitude,Name of City,Country,2021 Population,2020 Population,Growth,Population Difference,Population Change
8,39.906217,116.3912757,Beijing,China,20896820,20462610,0.0212,434210,grew
9,19.0759899,72.8773928,Mumbai,India,20667656,20411274,0.0126,256382,grew
10,34.6198813,135.490357,Osaka,Japan,19110616,19165340,-0.0029,54724,declined
11,24.8546842,67.0207055,Karachi,Pakistan,16459472,16093786,0.0227,365686,grew
12,29.5647398,106.5478767,Chongqing,China,16382376,15872179,0.0321,510197,grew
13,41.0096334,28.9651646,Istanbul,Turkey,15415197,15190336,0.0148,224861,grew
14,-34.6075682,-58.4370894,Buenos Aires,Argentina,15257673,15153729,0.0069,103944,grew
15,22.5414185,88.3576912,Kolkata,India,14974073,14850066,0.0084,124007,grew
16,-4.3217055,15.3125974,Kinshasa,DR Congo,14970460,14342439,0.0438,628021,grew
17,6.4550575,3.3941795,Lagos,Nigeria,14862111,14368332,0.0344,493779,grew
18,14.5907332,120.9809674,Manila,Philippines,14158573,13923452,0.0169,235121,grew
19,39.0856735,117.1951073,Tianjin,China,13794450,13589078,0.0151,205372,grew
20,23.1301964,113.2592945,Guangzhou,China,13635397,13301532,0.0251,333865,grew
21,-22.9110137,-43.2093727,Rio de Janeiro,Brazil,13544462,13458075,0.0064,86387,grew
22,31.5656822,74.3141829,Lahore,Pakistan,13095166,12642423,0.0358,452743,grew
23,12.9767936,77.590082,Bangalore,India,12764935,12326532,0.0356,438403,grew
24,55.7504461,37.6174943,Moscow,Russia,12593252,12537954,0.0044,55298,grew
25,22.555454,114.0543297,Shenzhen,China,12591696,12356820,0.019,234876,grew
26,13.0836939,80.270186,Chennai,India,11235018,10971108,0.0241,263910,grew

1 rank Latitude Longitude Name of City Country 2021 Population 2020 Population Growth Population Difference Population Change
2 8 39.906217 116.3912757 Beijing China 20896820 20462610 0.0212 434210 grew
3 9 19.0759899 72.8773928 Mumbai India 20667656 20411274 0.0126 256382 grew
4 10 34.6198813 135.490357 Osaka Japan 19110616 19165340 -0.0029 54724 declined
5 11 24.8546842 67.0207055 Karachi Pakistan 16459472 16093786 0.0227 365686 grew
6 12 29.5647398 106.5478767 Chongqing China 16382376 15872179 0.0321 510197 grew
7 13 41.0096334 28.9651646 Istanbul Turkey 15415197 15190336 0.0148 224861 grew
8 14 -34.6075682 -58.4370894 Buenos Aires Argentina 15257673 15153729 0.0069 103944 grew
9 15 22.5414185 88.3576912 Kolkata India 14974073 14850066 0.0084 124007 grew
10 16 -4.3217055 15.3125974 Kinshasa DR Congo 14970460 14342439 0.0438 628021 grew
11 17 6.4550575 3.3941795 Lagos Nigeria 14862111 14368332 0.0344 493779 grew
12 18 14.5907332 120.9809674 Manila Philippines 14158573 13923452 0.0169 235121 grew
13 19 39.0856735 117.1951073 Tianjin China 13794450 13589078 0.0151 205372 grew
14 20 23.1301964 113.2592945 Guangzhou China 13635397 13301532 0.0251 333865 grew
15 21 -22.9110137 -43.2093727 Rio de Janeiro Brazil 13544462 13458075 0.0064 86387 grew
16 22 31.5656822 74.3141829 Lahore Pakistan 13095166 12642423 0.0358 452743 grew
17 23 12.9767936 77.590082 Bangalore India 12764935 12326532 0.0356 438403 grew
18 24 55.7504461 37.6174943 Moscow Russia 12593252 12537954 0.0044 55298 grew
19 25 22.555454 114.0543297 Shenzhen China 12591696 12356820 0.019 234876 grew
20 26 13.0836939 80.270186 Chennai India 11235018 10971108 0.0241 263910 grew

File diff suppressed because it is too large Load Diff

View File

@ -23,115 +23,39 @@ async function visitCityPages(): Promise<void> {
console.log('Connecting to existing Chrome browser...');
const driver = await useChrome();
const driver = await useExistingChrome();
if (!driver) return;
// Visit each city's TripAdvisor page
for (let i = 0; i < cities.length; i++) {
const city = cities[i];
console.log(`[${i + 1}/${cities.length}] Visiting TripAdvisor page for ${city}...`);
let originalWindow;
let cityTopWindow;
let attactionsWindow;
let museumWindow;
try {
const originalWindow = await driver.getWindowHandle();
await driver.get("https://www.tripadvisor.com/");
await WebDriverUtils.wait(driver);
console.log("Logo click")
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
console.log("Exec Search")
if (!await UIActions.execSearch(driver, city)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
console.log("Click See all")
let seeAllUrl = await UIActions.getSeeAllUrl(driver);
if (seeAllUrl.length == 0) {
if (!await UIActions.clickTourismLink(driver)) throw `${city} failed`;
seeAllUrl = await UIActions.getSeeAllUrl(driver);
}
if (seeAllUrl.length == 0) throw `${city} failed`;
await WebDriverUtils.wait();
// open new incognito window
const driver2 = await useChrome();
if (!driver2) throw `${city} failed`;
await WebDriverUtils.wait();
await driver2.get(seeAllUrl);
await WebDriverUtils.wait();
console.log("Switch tab")
let windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow) {
cityTopWindow = handle;
await driver.switchTo().window(handle);
}
}
console.log("Click See all attractions")
if (!await UIActions.getSeeAllAttractionsUrl(driver)) throw `${city} failed`;
await WebDriverUtils.wait();
console.log("Switch tab to Attraction")
windows = await driver.getAllWindowHandles();
// Switch to the newly opened window/tab
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow) {
attactionsWindow = handle;
await driver.switchTo().window(attactionsWindow);
}
}
// click museum
console.log("Click Museum link");
if (!await UIActions.clickMuseumsLink(driver)) throw `${city} failed`;
if (!await UIActions.execSearch(driver, `"${city}" museums`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
let page = 1;
let museumNames: string[] = [];
while (1) {
// get list of museums
console.log("Get list of museums");
const museumElms = await UIActions.getMusiums(driver);
const tmpMuseumNames = await UIActions.getAttractionNames(driver);
await WebDriverUtils.wait(driver);
for (const listItem of museumElms) {
await listItem.click();
await WebDriverUtils.wait(driver);
windows = await driver.getAllWindowHandles();
for (const handle of windows) {
if (handle !== originalWindow && handle !== cityTopWindow && handle !== attactionsWindow) {
museumWindow = handle;
await driver.switchTo().window(museumWindow);
}
}
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
museumWindow && await driver.switchTo().window(museumWindow);
await driver.close();
await WebDriverUtils.wait(driver);
attactionsWindow && await driver.switchTo().window(attactionsWindow);
await WebDriverUtils.wait(driver);
}
museumNames = [...museumNames, ...tmpMuseumNames]
page++;
@ -142,17 +66,41 @@ async function visitCityPages(): Promise<void> {
}
for (const museumIndex in museumNames) {
await UIActions.closeAllTabsExceptFirst(driver);
const museumName = museumNames[museumIndex];
try{
console.log(`reading museum ${museumIndex}/${museumNames.length}`);
if (!await UIActions.gotoHome(driver)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
if (!await UIActions.typeSearch(driver, `${city} ${museumName}`)) throw `${city} failed`;
await WebDriverUtils.wait(driver);
await UIActions.clickFirstAttractionLinkInForm(driver)
await WebDriverUtils.wait(driver);
const { websiteUrl, email } = await UIActions.getWebsiteAndEmail(driver);
console.log(`${websiteUrl} / ${email}`);
saveContactInfoToCSV(city, { name: museumName, websiteUrl: websiteUrl, email: email }, path.join(__dirname, '../data/contact_info.csv'));
await UIActions.simulateClickAt(driver,100,100);
await WebDriverUtils.wait(driver);
}catch(e){
console.error(`failed ${museumName}`)
}
}
if (i < cities.length - 1) {
console.log(`Waiting for 5000 seconds before next city...`);
await WebDriverUtils.wait(); // Wait 5000 seconds before next city
}
} catch (error) {
await UIActions.closeAllTabsExceptFirst(driver);
} catch (error) {
// If the button is not found within the timeout, log and continue to the next city
console.log(`No Museums button found for ${city}. Moving to next city after 5 seconds...`);

View File

@ -1,4 +1,4 @@
import { Builder, By, until, WebDriver, WebElement } from 'selenium-webdriver';
import { Builder, By, until, WebDriver, WebElement, Actions } from 'selenium-webdriver';
import * as chromedriver from 'chromedriver';
import chrome, { ServiceBuilder } from 'selenium-webdriver/chrome';
import * as fs from 'fs';
@ -8,7 +8,7 @@ import { ContactInfo } from './types';
export async function execSearch(driver: WebDriver, city: string): Promise<boolean> {
export async function execSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
try {
// Find the search input field
const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
@ -18,7 +18,7 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
// Clear any existing text and enter the city name
await searchInput.clear();
await searchInput.sendKeys(city);
await searchInput.sendKeys(searchTerm);
// Submit the search (press Enter)
await WebDriverUtils.wait(driver);
@ -31,6 +31,25 @@ export async function execSearch(driver: WebDriver, city: string): Promise<boole
}
}
/**
 * Types a search term into the site's search box without submitting it.
 *
 * Unlike a full search, this only fills the input: the field is cleared and
 * the term is sent as keystrokes, and no Enter key is pressed here.
 *
 * @param driver - WebDriver session whose current page contains the search box.
 * @param searchTerm - Text to type into the search box.
 * @returns `true` once the term has been typed; `false` if any step threw
 *          (the failure is logged rather than rethrown).
 */
export async function typeSearch(driver: WebDriver, searchTerm: string): Promise<boolean> {
  // The search input is matched by its name plus its exact placeholder text.
  const searchSelector = 'input[name="q"][placeholder="Places to go, things to do, hotels..."]';
  try {
    // 10000 is handed to the helper as its timeout (presumably milliseconds — confirm in WebDriverUtils).
    await WebDriverUtils.waitForElement(driver, searchSelector, 10000);
    console.log("Search box found");

    const searchInput = await driver.findElement(By.css(searchSelector));

    // Clear any existing text before entering the search term.
    await searchInput.clear();
    await searchInput.sendKeys(searchTerm);
    return true;
  } catch (e) {
    // Report why typing failed instead of swallowing the error silently.
    console.warn('Error typing search term:', e);
    return false;
  }
}
export async function clickSeeAll(driver: WebDriver): Promise<boolean> {
try {
const seeAllElement = await driver.wait(
@ -80,7 +99,7 @@ export async function gotoHome(driver: WebDriver): Promise<boolean> {
}
export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean> {
const xpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
const xpath = `//h3[starts-with(normalize-space(.), 'Things to do')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
try {
const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000);
@ -98,7 +117,7 @@ export async function clickSeeAllAttractions(driver: WebDriver): Promise<boolean
}
export async function getSeeAllAttractionsUrl(driver: WebDriver): Promise<string | null> {
const xpath = `//h2[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
const xpath = `//h3[starts-with(normalize-space(.), 'Top Attractions in')]/parent::*[1]//a[starts-with(@href, '/Attractions') and .//span[normalize-space(.)='See all']]`;
try {
const anchorElement = await driver.wait(until.elementLocated(By.xpath(xpath)), 5000);
@ -161,8 +180,51 @@ export async function getMusiums(driver: WebDriver): Promise<WebElement[]> {
}
}
/**
 * Collects the `href` of every attraction card link on the current page.
 *
 * A link qualifies when it sits inside a `<section>` that also contains an
 * `<h3>`, its `href` starts with `/Attraction`, and it wraps an `<img>`.
 *
 * @param driver - WebDriver session positioned on a listing page.
 * @returns The non-empty `href` values in document order, or an empty array
 *          if the lookup throws.
 */
export async function getMuseumsLinks(driver: WebDriver): Promise<string[]> {
  const cardLinkXPath = `//div//section[.//a[starts-with(@href, '/Attraction')] and .//h3]//a[starts-with(@href, '/Attraction') and .//img]`;

  try {
    const anchors = await driver.findElements(By.xpath(cardLinkXPath));
    const hrefs: string[] = [];

    // Attributes are read one at a time, in the order the anchors were found.
    for (const anchor of anchors) {
      const href = await anchor.getAttribute('href');
      if (!href) {
        continue;
      }
      hrefs.push(href);
    }

    return hrefs;
  } catch (error) {
    console.warn('Error getting attraction URLs:', error);
    return [];
  }
}
/**
 * Reads the attraction names shown on the current listing page.
 *
 * Names come from `<h3>` elements that are direct children of links whose
 * `href` starts with `/Attraction_Review`. Each heading is trimmed and a
 * leading rank prefix such as `"12. "` is removed.
 *
 * @param driver - WebDriver session positioned on a listing page.
 * @returns Cleaned, non-empty names in document order, or an empty array if
 *          the lookup throws.
 */
export async function getAttractionNames(driver: WebDriver): Promise<string[]> {
  // XPath to find <h3> inside <a> whose href starts with /Attraction_Review
  const xpath = `//a[starts-with(@href, '/Attraction_Review')]/h3`;

  try {
    const h3Elements = await driver.findElements(By.xpath(xpath));
    const names: string[] = [];

    for (const h3 of h3Elements) {
      const text = await h3.getText();
      if (!text) {
        continue;
      }

      // Strip surrounding whitespace and the "N. " ranking prefix.
      const name = text.trim().replace(/^\d+\.\s*/, '');

      // A heading that held only whitespace or a bare rank ("3.") cleans down
      // to an empty string; skip it so callers never receive a blank name.
      if (name) {
        names.push(name);
      }
    }

    return names;
  } catch (error) {
    console.warn('Error getting attraction names:', error);
    return [];
  }
}
export async function getWebsiteAndEmail(driver: WebDriver): Promise<ContactInfo> {
const result: ContactInfo = { websiteUrl: null, email: null };
const result: ContactInfo = { name: null, websiteUrl: null, email: null };
// XPath to find URL (starting with 'http' but not containing 'tripadvisor')
const urlXPath = `//a[starts-with(@href, 'http') and not(contains(@href, 'tripadvisor'))]`;
@ -224,4 +286,42 @@ export async function closeAllTabsExceptFirst(driver: WebDriver): Promise<void>
await driver.switchTo().window(originalHandle);
console.log(`Switched back to original tab: ${originalHandle}`);
}
}
/**
 * Clicks the first link inside any `<form>` whose `href` starts with
 * `/Attraction`.
 *
 * The link must be located, visible and enabled, each checked with its own
 * 5000 timeout, before the click is issued.
 *
 * @param driver - WebDriver session positioned on the page holding the form.
 * @returns `true` after a successful click; `false` if locating, waiting or
 *          clicking threw (the failure is logged).
 */
export async function clickFirstAttractionLinkInForm(driver: WebDriver): Promise<boolean> {
  const attractionLinkXPath = `//form//a[starts-with(@href, '/Attraction')]`;
  const timeout = 5000;

  try {
    // Locate the link, then make sure it can actually receive a click.
    const anchor = await driver.wait(until.elementLocated(By.xpath(attractionLinkXPath)), timeout);
    await driver.wait(until.elementIsVisible(anchor), timeout);
    await driver.wait(until.elementIsEnabled(anchor), timeout);

    await anchor.click();
    console.log('Clicked the first attraction link in the form.');
    return true;
  } catch (error) {
    console.warn('Attraction link not found or not clickable.', error);
    return false;
  }
}
/**
 * Moves the pointer to a viewport coordinate and clicks there.
 *
 * The move is issued without an explicit `origin`, so the offsets use the
 * Actions API default origin (the viewport). Passing an element as the origin
 * would instead measure the offsets from that element's in-view centre, which
 * does not match the "click at (x, y)" contract of this function.
 *
 * Errors are logged and not rethrown, so a failed click never aborts the
 * caller.
 *
 * @param driver - WebDriver session to act on.
 * @param x - Horizontal offset from the viewport's left edge, in CSS pixels.
 * @param y - Vertical offset from the viewport's top edge, in CSS pixels.
 */
export async function simulateClickAt(driver: WebDriver, x: number, y: number): Promise<void> {
  try {
    await driver
      .actions({ bridge: true }) // optional; required in some environments
      .move({ x, y }) // no origin given: offsets are relative to the viewport
      .click()
      .perform();

    console.log(`Clicked at (${x}, ${y})`);
  } catch (error) {
    console.error('An error occurred:', error);
  }
}

View File

@ -1,4 +1,5 @@
/**
 * Contact details collected for a single attraction.
 *
 * Every field is nullable: a value that could not be determined is stored as
 * `null` rather than being omitted.
 */
export interface ContactInfo {
  /** Display name of the attraction, or `null` if not known. */
  name: string | null;
  /** External website URL of the attraction, or `null` if none was found. */
  websiteUrl: string | null;
  /** Contact e-mail address, or `null` if none was found. */
  email: string | null;
}

View File

@ -15,32 +15,29 @@ export class WebDriverUtils {
* @returns Promise that resolves after the specified time
*/
static async wait(driver?: WebDriver): Promise<void> {
const seconds = Math.floor(Math.random() * 1000) % 3 + 3;
const seconds = Math.floor(Math.random() * 1000) % 3 + 2;
console.log(`Scrolling to bottom for ${seconds} seconds...`);
const endTime = Date.now() + seconds * 1000;
let scrollCounter = 0;
while (Date.now() < endTime) {
/*
try {
if(driver){
if(scrollCounter < 4){
await driver.executeScript(`
window.scrollBy(0, window.innerHeight);
`);
}else{
await driver.executeScript(`
window.scrollTo(0, 0);
`);
}
if (driver) {
await driver.executeScript(`
window.scrollBy(0, 10);
`);
scrollCounter++;
}
} catch (error) {
console.warn('Scroll failed:', error);
}
*/
// Wait a little between scrolls
await new Promise(resolve => setTimeout(resolve, 500));
}
@ -61,7 +58,7 @@ export class WebDriverUtils {
export function saveContactInfoToCSV(city: string, contactInfo: ContactInfo, filePath: string): void {
const headers = 'City,Website URL,Email\n';
const line = `"${city},"${contactInfo.websiteUrl}","${contactInfo.email}"\n`;
const line = `"${city}","${contactInfo.name}","${contactInfo.websiteUrl}","${contactInfo.email}"\n`;
if (!existsSync(filePath)) {
writeFileSync(filePath, headers + line);
@ -111,7 +108,7 @@ export async function disableCookiesInChrome(): Promise<WebDriver | null> {
}
export async function useChrome(): Promise<WebDriver | null> {
export async function useChrome(initialUrl: string = "https://www.tripadvisor.com/"): Promise<WebDriver | null> {
// Set Chrome options
const options = new chrome.Options();
@ -133,7 +130,7 @@ export async function useChrome(): Promise<WebDriver | null> {
.setChromeOptions(options)
.build();
await driver.get('https://www.tripadvisor.com/');
await driver.get(initialUrl);
console.log('Chrome launched with cookies disabled.');