Commit 37113c67 by Prasad Gaikwad

Initial commit

parent 4e3958c6
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Set work directory
WORKDIR /app
# Install system dependencies required by Playwright's browsers
# Using the combined command to install dependencies for all browsers
# See: https://playwright.dev/docs/docker#install-system-dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
# --- Playwright dependencies ---
libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
# --- Other useful packages ---
curl \
# --- Cleanup ---
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
# Copy the dependency manifests into the container at /app
COPY requirements.txt setup.py ./
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Register the package in editable mode; its source is copied in below
RUN pip install -e . --no-deps
# Install Playwright browsers
# This command downloads the browser binaries into the image
RUN playwright install --with-deps
# Copy the rest of the application code into the container at /app
COPY . .
# Expose the port the app runs on
EXPOSE 8001
# Define the command to run the application
# Use 0.0.0.0 to make it accessible from outside the container
CMD ["uvicorn", "gmaps_scraper_server.main_api:app", "--host", "0.0.0.0", "--port", "8001"]
MIT License
Copyright (c) 2025 conor-is-my-name
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
# Google Maps Scraper API

A FastAPI service for scraping Google Maps data based on search queries. Ideal for n8n users.

Very high performance; watch out for rate limiting!

Use variables to substitute the URL parameters, e.g.:

`/scrape-get?query=hotels%20in%2098392&max_places=100&lang=en&headless=true`

If you are using n8n or another automation tool, call the `/scrape-get` endpoint so the results come back in the response.

Simple install: copy the files and run `docker compose up -d`.

Intended to be used with this n8n build:
https://github.com/conor-is-my-name/n8n-autoscaling
## API Endpoints
### POST `/scrape`
Main scraping endpoint (recommended for production)
**Parameters:**

- `query` (required): Search query (e.g., "hotels in 98392")
- `max_places` (optional): Maximum number of results to return
- `lang` (optional, default "en"): Language code for results
- `headless` (optional, default true): Run browser in headless mode
- `lat`, `lng` (optional): Coordinates to center the search around

### GET `/scrape-get`

Alternative GET endpoint with the same parameters, plus `max_distance_km` (optional, default 30.0): the maximum distance in kilometers from (`lat`, `lng`) a place may be and still be returned
### GET `/`
Health check endpoint
## Example Requests
### POST Example

The endpoint declares its parameters with FastAPI's `Query`, so they are read from the URL query string rather than from a JSON body:

```bash
curl -X POST "http://localhost:8001/scrape?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
### GET Example
```bash
curl "http://localhost:8001/scrape-get?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
or, from another container on the same Docker network:
```bash
curl "http://gmaps_scraper_api_service:8001/scrape-get?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
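The same endpoint can be called from a script. A minimal Python sketch using `requests`, mirroring the bundled notebook:

```python
import requests

# Parameters mirror the curl examples above; lat/lng/max_distance_km are optional.
params = {
    "query": "hotels in 98392",
    "max_places": 10,
    "lang": "en",
    "headless": "true",
}
response = requests.get("http://localhost:8001/scrape-get", params=params)
response.raise_for_status()
places = response.json()  # list of dicts, one per scraped place
print(f"Got {len(places)} places")
```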
## Running the Service
### Docker
```bash
docker-compose up --build
```
### Local Development
1. Install dependencies:
```bash
pip install -r requirements.txt
```
2. Run the API:
```bash
uvicorn gmaps_scraper_server.main_api:app --reload --port 8001
```
The API will be available at `http://localhost:8001`, or at `http://gmaps_scraper_api_service:8001` from other containers on the same Docker network.
## Notes
- For production use, consider adding authentication
- The scraping process may take several seconds to minutes depending on the number of results
- Results format depends on the underlying scraper implementation; a representative record is sketched below
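A representative result record, assembled from the fields the extractor currently emits and the sample notebook output (illustrative only, not a guaranteed schema):

```python
# Illustrative record based on fields seen in extractor.py and the sample notebook.
example_place = {
    "name": "Campus Exclusive Store",
    "place_id": "0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91",
    "address": "Survey No.207, Pheonix Mall pune, Store No, GP...",  # truncated in the sample output
    "rating": 4.5,
    "reviews_count": 76,
    "categories": ["Shoe store", "Sportswear store", "Store"],
    "website": "https://stores.campusshoes.com/maharashtra/pun...",  # truncated in the sample output
    "phone": "9289690432",
    "link": "https://www.google.com/maps/place/Campus+Exclu...",  # truncated in the sample output
    "latitude": 18.562243,
    "longitude": 73.916699,
}
```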
version: '3.8'

services:
  scraper-api:
    build: .  # Build the image from the Dockerfile in the current directory
    container_name: gmaps_scraper_api_service  # Optional: specify a container name
    ports:
      - "8001:8001"  # Map host port 8001 to container port 8001
    restart: unless-stopped  # Restart policy
    volumes:
      - .:/app  # Mount current directory to /app in container
    working_dir: /app  # Set working directory to mounted volume
    networks:
      - shark
    # Optional: Add environment variables if needed for configuration
    # environment:
    #   - HEADLESS_MODE=true
    cpu_shares: 1024  # Add cpu_shares here if not using Swarm mode

# Create the external network first with:
#   docker network create shark
networks:
  shark:
    external: true
from fastapi import FastAPI, HTTPException, Query
from typing import Optional, List, Dict, Any
import logging

# Import the scraper function (adjust path if necessary)
try:
    from gmaps_scraper_server.scraper import scrape_google_maps
except ImportError:
    # Handle case where scraper might be in a different structure later
    logging.error("Could not import scrape_google_maps from scraper.py")

    # Define a dummy function to allow the API to start, but fail on call
    def scrape_google_maps(*args, **kwargs):
        raise ImportError("Scraper function not available.")

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

app = FastAPI(
    title="Google Maps Scraper API",
    description="API to trigger Google Maps scraping based on a query.",
    version="0.1.0",
)
@app.post("/scrape", response_model=List[Dict[str, Any]])
async def run_scrape(
    query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"),
    max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."),
    lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."),
    headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally."),
    lat: Optional[float] = Query(None, description="Latitude to center the search around."),
    lng: Optional[float] = Query(None, description="Longitude to center the search around.")
):
    """
    Triggers the Google Maps scraping process for the given query.
    """
    logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
    try:
        results = await scrape_google_maps(
            query=query,
            max_places=max_places,
            lang=lang,
            headless=headless,
            lat=lat,
            lng=lng
        )
        logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
        return results
    except ImportError as e:
        logging.error(f"ImportError during scraping for query '{query}': {e}")
        raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
    except Exception as e:
        logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True)
        # Consider more specific error handling based on scraper exceptions
        raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")
@app.get("/scrape-get", response_model=List[Dict[str, Any]])
async def run_scrape_get(
query: str = Query(...),
max_places: Optional[int] = Query(None),
lang: str = Query("en"),
headless: bool = Query(True),
lat: Optional[float] = Query(None),
lng: Optional[float] = Query(None),
max_distance_km: float = Query(30.0, description="Maximum distance in kilometers from (lat, lng)")
):
try:
results = await scrape_google_maps(
query=query,
max_places=max_places,
lang=lang,
headless=headless,
lat=lat,
lng=lng,
max_distance_km=max_distance_km
)
return results
except Exception as e:
logging.error(f"Error: {e}", exc_info=True)
raise HTTPException(status_code=500, detail="Internal Server Error")
@app.get("/")
async def read_root():
return {"message": "Google Maps Scraper API is running."}
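A quick smoke test against a running instance (an editorial sketch; it assumes the service is listening on localhost:8001):

```python
import requests

# The root endpoint doubles as a health check.
r = requests.get("http://localhost:8001/")
print(r.json())  # {'message': 'Google Maps Scraper API is running.'}
```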
 import json
 import re
+import csv
+import os

 def safe_get(data, *keys):
     """
@@ -250,6 +252,7 @@ def get_thumbnail(data):
 def extract_place_data(html_content):
     """
     High-level function to orchestrate extraction from HTML content.
+    Saves extracted data into 'extracted_places.csv'.
     """
     json_str = extract_initial_json(html_content)
     if not json_str:
@@ -261,7 +264,16 @@ def extract_place_data(html_content):
         print("Failed to parse JSON data or find expected structure.")
         return None

-    # Now extract individual fields using the helper functions
+    print("Parsed data_blob type:", type(data_blob))
+    if isinstance(data_blob, list):
+        print("data_blob length:", len(data_blob))
+
+    with open("debug_data_blob_per_place.json", "w", encoding="utf-8") as f:
+        json.dump(data_blob, f, indent=2)
+    with open("debug_full_place_page.json", "w", encoding="utf-8") as f:
+        json.dump(data_blob, f, indent=2)
+
+    # Extract individual fields
     place_details = {
         "name": get_main_name(data_blob),
         "place_id": get_place_id(data_blob),
@@ -271,15 +283,40 @@ def extract_place_data(html_content):
         "reviews_count": get_reviews_count(data_blob),
         "categories": get_categories(data_blob),
         "website": get_website(data_blob),
-        "phone": get_phone_number(data_blob), # Needs index verification
-        "thumbnail": get_thumbnail(data_blob), # Needs index verification
-        # Add other fields as needed
+        "phone": get_phone_number(data_blob),
+        "thumbnail": get_thumbnail(data_blob),
     }
-    # Filter out None values if desired
+    # Flatten coordinates into latitude and longitude for CSV
+    coords = place_details.pop("coordinates", None)
+    if coords:
+        place_details["latitude"] = coords.get("latitude")
+        place_details["longitude"] = coords.get("longitude")
+
+    # Filter out None values
     place_details = {k: v for k, v in place_details.items() if v is not None}

-    return place_details if place_details else None
+    # Save to CSV if valid
+    if place_details:
+        save_to_csv(place_details, "extracted_places.csv")
+        return place_details
+    else:
+        print("No valid fields found to save.")
+        return None
+def save_to_csv(data: dict, filename: str):
+    """Appends a single row dictionary to a CSV file, creating it if it doesn't exist."""
+    file_exists = os.path.isfile(filename)
+    with open(filename, mode='a', encoding='utf-8', newline='') as f:
+        writer = csv.DictWriter(f, fieldnames=data.keys())
+        if not file_exists:
+            writer.writeheader()
+        writer.writerow(data)
+    print(f"Saved extracted data to {filename}")
# Example usage (for testing):
if __name__ == '__main__':
......
@@ -33,14 +33,13 @@ async def run_scrape(
     """
     logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
     try:
-        # Run the potentially long-running scraping task
-        # Note: For production, consider running this in a background task queue (e.g., Celery)
-        # to avoid blocking the API server for long durations.
-        results = await scrape_google_maps( # Added await
+        results = await scrape_google_maps(
             query=query,
             max_places=max_places,
             lang=lang,
-            headless=headless # Pass headless option from API
+            headless=headless,
+            lat=lat,
+            lng=lng
         )
         logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
         return results
@@ -54,42 +53,29 @@ async def run_scrape(
 @app.get("/scrape-get", response_model=List[Dict[str, Any]])
 async def run_scrape_get(
-    query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"),
-    max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."),
-    lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."),
-    headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally.")
+    query: str = Query(...),
+    max_places: Optional[int] = Query(None),
+    lang: str = Query("en"),
+    headless: bool = Query(True),
+    lat: Optional[float] = Query(None),
+    lng: Optional[float] = Query(None),
+    max_distance_km: float = Query(30.0, description="Maximum distance in kilometers from (lat, lng)")
 ):
-    """
-    Triggers the Google Maps scraping process for the given query via GET request.
-    """
-    logging.info(f"Received GET scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
     try:
-        # Run the potentially long-running scraping task
-        # Note: For production, consider running this in a background task queue (e.g., Celery)
-        # to avoid blocking the API server for long durations.
-        results = await scrape_google_maps( # Added await
+        results = await scrape_google_maps(
             query=query,
             max_places=max_places,
             lang=lang,
-            headless=headless # Pass headless option from API
+            headless=headless,
+            lat=lat,
+            lng=lng,
+            max_distance_km=max_distance_km
         )
-        logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
         return results
-    except ImportError as e:
-        logging.error(f"ImportError during scraping for query '{query}': {e}")
-        raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
     except Exception as e:
-        logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True)
-        # Consider more specific error handling based on scraper exceptions
-        raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")
-
-# Basic root endpoint for health check or info
+        logging.error(f"Error: {e}", exc_info=True)
+        raise HTTPException(status_code=500, detail="Internal Server Error")
@app.get("/")
async def read_root():
return {"message": "Google Maps Scraper API is running."}
# Example for running locally (uvicorn main_api:app --reload)
# if __name__ == "__main__":
# import uvicorn
# uvicorn.run(app, host="0.0.0.0", port=8001)
\ No newline at end of file
@@ -3,6 +3,7 @@ import asyncio  # Changed from time
 import re
 from playwright.async_api import async_playwright, TimeoutError as PlaywrightTimeoutError  # Changed to async
 from urllib.parse import urlencode
+from math import radians, sin, cos, sqrt, atan2

 # Import the extraction functions from our helper module
 from . import extractor
@@ -13,32 +14,46 @@ DEFAULT_TIMEOUT = 30000  # 30 seconds for navigation and selectors
 SCROLL_PAUSE_TIME = 1.5  # Pause between scrolls
 MAX_SCROLL_ATTEMPTS_WITHOUT_NEW_LINKS = 5  # Stop scrolling if no new links found after this many scrolls

-# --- Helper Functions ---
-def create_search_url(query, lang="en", geo_coordinates=None, zoom=None):
-    """Creates a Google Maps search URL."""
-    params = {'q': query, 'hl': lang}
-    # Note: geo_coordinates and zoom might require different URL structure (/maps/@lat,lng,zoom)
-    # For simplicity, starting with basic query search
-    return BASE_URL + "?" + urlencode(params)
-
-# --- Main Scraping Logic ---
-async def scrape_google_maps(query, max_places=None, lang="en", headless=True):  # Added async
+## Calculate distance
+def haversine(lat1, lon1, lat2, lon2):
+    R = 6371  # Radius of the Earth in kilometers
+    dlat = radians(lat2 - lat1)
+    dlon = radians(lon2 - lon1)
+    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
+    c = 2 * atan2(sqrt(a), sqrt(1 - a))
+    return R * c
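A quick sanity check of `haversine` (an editorial aside, not part of the diff), using the search center and first store coordinates that appear in the sample notebook further down:

```python
# Distance between the notebook's search center (near Nashik) and a Pune store.
d = haversine(19.99113822646553, 73.76191319096492, 18.562243, 73.916699)
print(f"{d:.1f} km")  # roughly 160 km, so well outside a 30 km radius
```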
+# --- Helper Functions ---
+def create_search_url(query, lang="en", geo_coordinates=None, zoom=12):
     """
-    Scrapes Google Maps for places based on a query.
+    Creates a Google Maps search URL centered around specific geo coordinates.

     Args:
-        query (str): The search query (e.g., "restaurants in New York").
-        max_places (int, optional): Maximum number of places to scrape. Defaults to None (scrape all found).
-        lang (str, optional): Language code for Google Maps (e.g., 'en', 'es'). Defaults to "en".
-        headless (bool, optional): Whether to run the browser in headless mode. Defaults to True.
+        query (str): Search query (e.g., "campus shoe store").
+        lang (str): Language code (default: "en").
+        geo_coordinates (tuple): (latitude, longitude) to center the map.
+        zoom (int): Zoom level (default: 12).

     Returns:
-        list: A list of dictionaries, each containing details for a scraped place.
-              Returns an empty list if no places are found or an error occurs.
+        str: Full Google Maps search URL.
     """
+    params = {'hl': lang}
+    encoded_query = query.replace(' ', '+')
+    if geo_coordinates:
+        lat, lng = geo_coordinates
+        return f"{BASE_URL}{encoded_query}/@{lat},{lng},{zoom}z?{urlencode(params)}"
+    return f"{BASE_URL}{encoded_query}?{urlencode(params)}"
+# --- Main Scraping Logic ---
+async def scrape_google_maps(query, max_places=None, lang="en", headless=True, lat=None, lng=None, max_distance_km=30):
     results = []
     place_links = set()
     scroll_attempts_no_new = 0
+    search_url = create_search_url(query, lang, geo_coordinates=(lat, lng))

     async with async_playwright() as p:  # Changed to async
         try:
@@ -50,14 +65,24 @@ async def scrape_google_maps(query, max_places=None, lang="en", headless=True):
                 # Consider setting viewport, locale, timezone if needed
                 locale=lang,
             )
             page = await context.new_page()  # Added await
             if not page:
                 await browser.close()  # Close browser before raising
+                if lat is not None and lng is not None:
+                    results = [
+                        place for place in results
+                        if 'latitude' in place and 'longitude' in place and
+                        haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
+                    ]
+                print(f"\nScraping finished. Found details for {len(results)} places.")
+                return results
                 raise Exception("Failed to create a new browser page (context.new_page() returned None).")

             # Removed problematic: await page.set_default_timeout(DEFAULT_TIMEOUT)
             # Removed associated debug prints
-            search_url = create_search_url(query, lang)
+            search_url = create_search_url(query, lang, geo_coordinates=(lat, lng))
             print(f"Navigating to search URL: {search_url}")
             await page.goto(search_url, wait_until='domcontentloaded')  # Added await
             await asyncio.sleep(2)  # Changed to asyncio.sleep, added await
@@ -100,6 +125,16 @@ async def scrape_google_maps(query, max_places=None, lang="en", headless=True):
                 print(f"Error: Feed element '{feed_selector}' not found. Maybe no results? Taking screenshot.")
                 await page.screenshot(path='feed_not_found_screenshot.png')  # Added await
                 await browser.close()  # Added await
+                if lat is not None and lng is not None:
+                    results = [
+                        place for place in results
+                        if 'latitude' in place and 'longitude' in place and
+                        haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
+                    ]
+                print(f"\nScraping finished. Found details for {len(results)} places.")
+                return results
                 return []  # No results or page structure changed

             if await page.locator(feed_selector).count() > 0:  # Added await
@@ -178,6 +213,15 @@ async def scrape_google_maps(query, max_places=None, lang="en", headless=True):
                     await asyncio.sleep(0.5)  # Changed to asyncio.sleep, added await

             await browser.close()  # Added await
+            if lat is not None and lng is not None:
+                results = [
+                    place for place in results
+                    if 'latitude' in place and 'longitude' in place and
+                    haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
+                ]
+            print(f"\nScraping finished. Found details for {len(results)} places.")
+            return results

         except PlaywrightTimeoutError:
             print(f"Timeout error during scraping process.")
@@ -189,9 +233,15 @@ async def scrape_google_maps(query, max_places=None, lang="en", headless=True):
             # Ensure browser is closed if an error occurred mid-process
             if 'browser' in locals() and browser.is_connected():  # Check if browser exists and is connected
                 await browser.close()  # Added await

-    print(f"\nScraping finished. Found details for {len(results)} places.")
-    return results
+    if lat is not None and lng is not None:
+        results = [
+            place for place in results
+            if 'latitude' in place and 'longitude' in place and
+            haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
+        ]
+    print(f"\nScraping finished. Found details for {len(results)} places.")
+    return results
# --- Example Usage ---
# (Example usage block removed as this script is now intended to be imported as a module)
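Editorial note: the same filter-and-return block now appears at four exit points of `scrape_google_maps`. A possible consolidation (a sketch, not part of the commit) would be a small helper:

```python
def filter_by_distance(results, lat, lng, max_distance_km):
    """Keep only places within max_distance_km of (lat, lng); no-op when no center is given."""
    if lat is None or lng is None:
        return results
    return [
        place for place in results
        if 'latitude' in place and 'longitude' in place
        and haversine(lat, lng, place['latitude'], place['longitude']) <= max_distance_km
    ]
```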
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3aed9b86-3149-43f0-b80f-332644d2aef4",
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:8000/scrape-get?query=campus%20stores&max_places=5&lang=en&headless=true&lat=19.99113822646553&lng=73.76191319096492&max_distance_km=30\"\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1700a291-3d4d-465d-8351-d7ebf68d6992",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"\n",
"df[\"latitude\"] = df[\"coordinates\"].apply(lambda x: x[\"latitude\"])\n",
"df[\"longitude\"] = df[\"coordinates\"].apply(lambda x: x[\"longitude\"])\n",
"df.drop(columns=\"coordinates\", inplace=True)\n",
"\n",
"df.to_csv(\"results.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2a79a396-b868-4087-aac3-eeea5efe2363",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "ce5b944c-8e50-4544-8524-ef2dddcc1f24",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>link</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91</td>\n",
" <td>Survey No.207, Pheonix Mall pune, Store No, GP...</td>\n",
" <td>4.5</td>\n",
" <td>76.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690432</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.562243</td>\n",
" <td>73.916699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2b99ce8a282a1:0xe1a794aef1ba34ff</td>\n",
" <td>SHOP NO 3, SHAGUN CHOWK, SR.2773/1,PRITAMDAS P...</td>\n",
" <td>4.4</td>\n",
" <td>46.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289148580</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.621968</td>\n",
" <td>73.801811</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>My Campus Store</td>\n",
" <td>0x3bc2bf8d7265c9f7:0xd8359f25ee6e29ca</td>\n",
" <td>2, Shridhar Building, Baner Rd, behind Ancient...</td>\n",
" <td>4.8</td>\n",
" <td>78.0</td>\n",
" <td>['Custom t-shirt store', 'E-commerce service',...</td>\n",
" <td>http://www.mycampusstore.in/</td>\n",
" <td>9637066482</td>\n",
" <td>https://www.google.com/maps/place/My+Campus+St...</td>\n",
" <td>18.557775</td>\n",
" <td>73.799953</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2ebdb93a63e61:0xb82636ae4fe6e866</td>\n",
" <td>Shop No. 2, Sr, No- 41, Katraj-Dehu Rd Bypass,...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>['Shoe store', 'Sportswear store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289018492</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.451480</td>\n",
" <td>73.848531</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x395b93d44507f5e5:0x5aff1ce2a3cb1675</td>\n",
" <td>First floor shop no. 14,333, Domestic Airport ...</td>\n",
" <td>4.9</td>\n",
" <td>35.0</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289925505</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.578019</td>\n",
" <td>73.907157</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91 \n",
"1 Campus Exclusive Store 0x3bc2b99ce8a282a1:0xe1a794aef1ba34ff \n",
"2 My Campus Store 0x3bc2bf8d7265c9f7:0xd8359f25ee6e29ca \n",
"3 Campus Exclusive Store 0x3bc2ebdb93a63e61:0xb82636ae4fe6e866 \n",
"4 Campus Exclusive Store 0x395b93d44507f5e5:0x5aff1ce2a3cb1675 \n",
"\n",
" address rating reviews_count \\\n",
"0 Survey No.207, Pheonix Mall pune, Store No, GP... 4.5 76.0 \n",
"1 SHOP NO 3, SHAGUN CHOWK, SR.2773/1,PRITAMDAS P... 4.4 46.0 \n",
"2 2, Shridhar Building, Baner Rd, behind Ancient... 4.8 78.0 \n",
"3 Shop No. 2, Sr, No- 41, Katraj-Dehu Rd Bypass,... NaN NaN \n",
"4 First floor shop no. 14,333, Domestic Airport ... 4.9 35.0 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store', 'Store'] \n",
"2 ['Custom t-shirt store', 'E-commerce service',... \n",
"3 ['Shoe store', 'Sportswear store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone \\\n",
"0 https://stores.campusshoes.com/maharashtra/pun... 9289690432 \n",
"1 https://stores.campusshoes.com/maharashtra/pun... 9289148580 \n",
"2 http://www.mycampusstore.in/ 9637066482 \n",
"3 https://stores.campusshoes.com/maharashtra/pun... 9289018492 \n",
"4 https://stores.campusshoes.com/maharashtra/pun... 9289925505 \n",
"\n",
" link latitude longitude \n",
"0 https://www.google.com/maps/place/Campus+Exclu... 18.562243 73.916699 \n",
"1 https://www.google.com/maps/place/Campus+Exclu... 18.621968 73.801811 \n",
"2 https://www.google.com/maps/place/My+Campus+St... 18.557775 73.799953 \n",
"3 https://www.google.com/maps/place/Campus+Exclu... 18.451480 73.848531 \n",
"4 https://www.google.com/maps/place/Campus+Exclu... 18.578019 73.907157 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7aba79b-ba84-49c3-a1bd-2b4ac257fc74",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "dee123a6-7a41-4224-bdfe-abb707987ce5",
"metadata": {},
"outputs": [],
"source": [
"import requests\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "3aed9b86-3149-43f0-b80f-332644d2aef4",
"metadata": {},
"outputs": [],
"source": [
"url = \"http://localhost:8000/scrape-get?query=campus%20stores&max_places=5&lang=en&headless=true&lat=19.99113822646553&lng=73.76191319096492&max_distance_km=30\"\n",
"response = requests.get(url)\n",
"data = response.json()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1700a291-3d4d-465d-8351-d7ebf68d6992",
"metadata": {},
"outputs": [],
"source": [
"df = pd.DataFrame(data)\n",
"\n",
"df[\"latitude\"] = df[\"coordinates\"].apply(lambda x: x[\"latitude\"])\n",
"df[\"longitude\"] = df[\"coordinates\"].apply(lambda x: x[\"longitude\"])\n",
"df.drop(columns=\"coordinates\", inplace=True)\n",
"\n",
"df.to_csv(\"results.csv\", index=False)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2a79a396-b868-4087-aac3-eeea5efe2363",
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('results.csv')"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ce5b944c-8e50-4544-8524-ef2dddcc1f24",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>place_id</th>\n",
" <th>address</th>\n",
" <th>rating</th>\n",
" <th>reviews_count</th>\n",
" <th>categories</th>\n",
" <th>website</th>\n",
" <th>phone</th>\n",
" <th>link</th>\n",
" <th>latitude</th>\n",
" <th>longitude</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91</td>\n",
" <td>Survey No.207, Pheonix Mall pune, Store No, GP...</td>\n",
" <td>4.5</td>\n",
" <td>76</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690432</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.562243</td>\n",
" <td>73.916699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be795198099b6e3:0x81def44cd8764dc6</td>\n",
" <td>First Floor, F12, Metro Junction Mall, Shilpha...</td>\n",
" <td>4.8</td>\n",
" <td>112</td>\n",
" <td>['Shoe store', 'Sportswear store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/tha...</td>\n",
" <td>9289677522</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.228908</td>\n",
" <td>73.123019</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7b9c605eea545:0xb793f8af46ce5fac</td>\n",
" <td>Shop No. 1&amp;2, Munshi Estate, Plot No 504, MG R...</td>\n",
" <td>4.7</td>\n",
" <td>100</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289148572</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.173210</td>\n",
" <td>72.955426</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3be7c9013c435c23:0x43967767d741332b</td>\n",
" <td>Selection Ahmed Palace, Plot No. 254 SV Road, ...</td>\n",
" <td>4.4</td>\n",
" <td>125</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/mum...</td>\n",
" <td>9289148575</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>19.060032</td>\n",
" <td>72.836883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Campus Exclusive Store</td>\n",
" <td>0x3bc2bb5812ba0af5:0xaec0dd35c89bc775</td>\n",
" <td>UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H...</td>\n",
" <td>4.8</td>\n",
" <td>254</td>\n",
" <td>['Shoe store', 'Sportswear store', 'Store']</td>\n",
" <td>https://stores.campusshoes.com/maharashtra/pun...</td>\n",
" <td>9289690420</td>\n",
" <td>https://www.google.com/maps/place/Campus+Exclu...</td>\n",
" <td>18.594074</td>\n",
" <td>73.725319</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name place_id \\\n",
"0 Campus Exclusive Store 0x3bc2c1c24c39b3d7:0xcccbaf8c3733cc91 \n",
"1 Campus Exclusive Store 0x3be795198099b6e3:0x81def44cd8764dc6 \n",
"2 Campus Exclusive Store 0x3be7b9c605eea545:0xb793f8af46ce5fac \n",
"3 Campus Exclusive Store 0x3be7c9013c435c23:0x43967767d741332b \n",
"4 Campus Exclusive Store 0x3bc2bb5812ba0af5:0xaec0dd35c89bc775 \n",
"\n",
" address rating reviews_count \\\n",
"0 Survey No.207, Pheonix Mall pune, Store No, GP... 4.5 76 \n",
"1 First Floor, F12, Metro Junction Mall, Shilpha... 4.8 112 \n",
"2 Shop No. 1&2, Munshi Estate, Plot No 504, MG R... 4.7 100 \n",
"3 Selection Ahmed Palace, Plot No. 254 SV Road, ... 4.4 125 \n",
"4 UNIT No-1, FLOOR, 02, GRANT STREET, Phase 1, H... 4.8 254 \n",
"\n",
" categories \\\n",
"0 ['Shoe store', 'Sportswear store', 'Store'] \n",
"1 ['Shoe store', 'Sportswear store'] \n",
"2 ['Shoe store', 'Sportswear store', 'Store'] \n",
"3 ['Shoe store', 'Sportswear store', 'Store'] \n",
"4 ['Shoe store', 'Sportswear store', 'Store'] \n",
"\n",
" website phone \\\n",
"0 https://stores.campusshoes.com/maharashtra/pun... 9289690432 \n",
"1 https://stores.campusshoes.com/maharashtra/tha... 9289677522 \n",
"2 https://stores.campusshoes.com/maharashtra/mum... 9289148572 \n",
"3 https://stores.campusshoes.com/maharashtra/mum... 9289148575 \n",
"4 https://stores.campusshoes.com/maharashtra/pun... 9289690420 \n",
"\n",
" link latitude longitude \n",
"0 https://www.google.com/maps/place/Campus+Exclu... 18.562243 73.916699 \n",
"1 https://www.google.com/maps/place/Campus+Exclu... 19.228908 73.123019 \n",
"2 https://www.google.com/maps/place/Campus+Exclu... 19.173210 72.955426 \n",
"3 https://www.google.com/maps/place/Campus+Exclu... 19.060032 72.836883 \n",
"4 https://www.google.com/maps/place/Campus+Exclu... 18.594074 73.725319 "
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e7aba79b-ba84-49c3-a1bd-2b4ac257fc74",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
playwright
fastapi
uvicorn[standard]
from setuptools import setup, find_packages

setup(
    name="gmaps_scraper_server",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "playwright",
        "fastapi",
        "uvicorn[standard]",
    ],
)