Commit 6256ca50 by cbolich

readme update

parent 7de85add
File added
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
.python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. PDM
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static analysis results
.pytype/
# Cython debug symbols
cython_debug/
# Version control directories
.git/
.hg/
.svn/
# Docker files
Dockerfile
docker-compose.yml
# VS Code settings
.vscode/
\ No newline at end of file
# Use an official Python runtime as a parent image
FROM python:3.10-slim
# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1
# Set work directory
WORKDIR /app
# Install system dependencies required by Playwright's browsers
# Using the combined command to install dependencies for all browsers
# See: https://playwright.dev/docs/docker#install-system-dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
# --- Playwright dependencies ---
libnss3 libnspr4 libdbus-1-3 libatk1.0-0 libatk-bridge2.0-0 libcups2 libdrm2 libxkbcommon0 libxcomposite1 libxdamage1 libxfixes3 libxrandr2 libgbm1 libpango-1.0-0 libcairo2 libasound2 \
# --- Other useful packages ---
curl \
# --- Cleanup ---
&& apt-get purge -y --auto-remove -o APT::AutoRemove::RecommendsImportant=false \
&& rm -rf /var/lib/apt/lists/*
# Copy the requirements file into the container at /app
COPY requirements.txt setup.py ./
# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install -e . --no-deps
# Install Playwright browsers
# This command downloads the browser binaries into the image
RUN playwright install --with-deps
# Copy the rest of the application code into the container at /app
COPY . .
# Expose the port the app runs on
EXPOSE 8001
# Define the command to run the application
# Use 0.0.0.0 to make it accessible from outside the container
CMD ["uvicorn", "gmaps_scraper_server.main_api:app", "--host", "0.0.0.0", "--port", "8001"]
\ No newline at end of file
# Google Maps Scraper API
A FastAPI service for scraping Google Maps data based on search queries.
Very high performance; watch out for Google's rate limiting.
Adjust the URL parameters to fit your search, e.g. `/scrape-get?query=hotels%20in%2098392&max_places=100&lang=en&headless=true`.
If you are calling the API from n8n or another automation tool, use the `/scrape-get` endpoint so the results come back in the response.
Installation is simple: copy the files and run `docker compose up -d`.
## API Endpoints
## API Endpoints
### POST `/scrape`
Main scraping endpoint (recommended for production)
**Parameters** (passed as URL query parameters):
- `query` (required): Search query (e.g., "hotels in 98392")
- `max_places` (optional): Maximum number of results to return
- `lang` (optional, default "en"): Language code for results
- `headless` (optional, default true): Run browser in headless mode
### GET `/scrape-get`
Alternative GET endpoint with same functionality
### GET `/`
Health check endpoint
## Example Requests
### POST Example
```bash
curl -X POST "http://localhost:8001/scrape?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
### GET Example
```bash
curl "http://localhost:8001/scrape-get?query=hotels%20in%2098392&max_places=10&lang=en&headless=true"
```
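Both endpoints return a JSON array of places, so you can, for example, count how many results came back if you have `jq` installed:
```bash
# Count the places returned for a query (assumes the service is running locally on port 8001)
curl -s "http://localhost:8001/scrape-get?query=hotels%20in%2098392&max_places=10" | jq length
```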
## Running the Service
### Docker
```bash
docker-compose up --build
```
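Once the container is up, you can verify the service is responding via the root health-check endpoint:
```bash
curl http://localhost:8001/
# Expected response: {"message":"Google Maps Scraper API is running."}
```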
### Local Development
1. Install dependencies:
```bash
pip install -r requirements.txt
```
2. Run the API:
```bash
uvicorn gmaps_scraper_server.main_api:app --reload --port 8001
```
The API will be available at `http://localhost:8001`
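Since the scraper drives a real browser through Playwright (as the Docker image suggests), a local run also needs the browser binaries; downloading them with the standard Playwright CLI should be enough:
```bash
# One-time download of the browser binaries Playwright uses
playwright install
```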
## Notes
- For production use, consider adding authentication (a minimal sketch is shown below)
- The scraping process may take several seconds to complete
- Results format depends on the underlying scraper implementation
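As a starting point for authentication, here is a minimal, untested sketch of an API-key guard that could be added to `main_api.py`; the `X-API-Key` header name and the `SCRAPER_API_KEY` environment variable are illustrative choices, not part of the current code:
```python
# Hypothetical API-key dependency -- header name and env var are illustrative only.
import os

from fastapi import Depends, HTTPException, Security
from fastapi.security.api_key import APIKeyHeader

api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)

async def require_api_key(api_key: str | None = Security(api_key_header)):
    # Compare the supplied header against a key configured via an environment variable
    expected = os.environ.get("SCRAPER_API_KEY")
    if not expected or api_key != expected:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")

# Protect an endpoint by declaring the dependency, e.g.:
# @app.post("/scrape", dependencies=[Depends(require_api_key)])
```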
\ No newline at end of file
services:
  scraper-api:
    build: .  # Build the image from the Dockerfile in the current directory
    container_name: gmaps_scraper_api_service  # Optional: specify a container name
    ports:
      - "8001-8004:8001"  # Map a free host port from the 8001-8004 range to container port 8001
    restart: unless-stopped  # Restart policy
    volumes:
      - .:/app  # Mount current directory to /app in container
    working_dir: /app  # Set working directory to mounted volume
    # Optional: Add environment variables if needed for configuration
    # environment:
    #   - HEADLESS_MODE=true
    networks:
      - shark
    cpu_shares: 1024  # Add cpu_shares here if not using Swarm mode
    # deploy:
    #   replicas: 4
    #   resources:
    #     limits:
    #       cpus: '1'
    #       memory: 2G
networks:
  shark:
    external: true
\ No newline at end of file
# Initialize the gmaps_scraper_server package
\ No newline at end of file
from fastapi import FastAPI, HTTPException, Query
from typing import Optional, List, Dict, Any
import logging

# Import the scraper function (adjust path if necessary)
try:
    from gmaps_scraper_server.scraper import scrape_google_maps
except ImportError:
    # Handle case where scraper might be in a different structure later
    logging.error("Could not import scrape_google_maps from scraper.py")

    # Define a dummy function to allow API to start, but fail on call
    def scrape_google_maps(*args, **kwargs):
        raise ImportError("Scraper function not available.")

# Configure basic logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

app = FastAPI(
    title="Google Maps Scraper API",
    description="API to trigger Google Maps scraping based on a query.",
    version="0.1.0",
)


@app.post("/scrape", response_model=List[Dict[str, Any]])
async def run_scrape(
    query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"),
    max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."),
    lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."),
    headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally.")
):
    """
    Triggers the Google Maps scraping process for the given query.
    """
    logging.info(f"Received scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
    try:
        # Run the potentially long-running scraping task
        # Note: For production, consider running this in a background task queue (e.g., Celery)
        # to avoid blocking the API server for long durations.
        results = await scrape_google_maps(  # Added await
            query=query,
            max_places=max_places,
            lang=lang,
            headless=headless  # Pass headless option from API
        )
        logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
        return results
    except ImportError as e:
        logging.error(f"ImportError during scraping for query '{query}': {e}")
        raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
    except Exception as e:
        logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True)
        # Consider more specific error handling based on scraper exceptions
        raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")


@app.get("/scrape-get", response_model=List[Dict[str, Any]])
async def run_scrape_get(
    query: str = Query(..., description="The search query for Google Maps (e.g., 'restaurants in New York')"),
    max_places: Optional[int] = Query(None, description="Maximum number of places to scrape. Scrapes all found if None."),
    lang: str = Query("en", description="Language code for Google Maps results (e.g., 'en', 'es')."),
    headless: bool = Query(True, description="Run the browser in headless mode (no UI). Set to false for debugging locally.")
):
    """
    Triggers the Google Maps scraping process for the given query via GET request.
    """
    logging.info(f"Received GET scrape request for query: '{query}', max_places: {max_places}, lang: {lang}, headless: {headless}")
    try:
        # Run the potentially long-running scraping task
        # Note: For production, consider running this in a background task queue (e.g., Celery)
        # to avoid blocking the API server for long durations.
        results = await scrape_google_maps(  # Added await
            query=query,
            max_places=max_places,
            lang=lang,
            headless=headless  # Pass headless option from API
        )
        logging.info(f"Scraping finished for query: '{query}'. Found {len(results)} results.")
        return results
    except ImportError as e:
        logging.error(f"ImportError during scraping for query '{query}': {e}")
        raise HTTPException(status_code=500, detail="Server configuration error: Scraper not available.")
    except Exception as e:
        logging.error(f"An error occurred during scraping for query '{query}': {e}", exc_info=True)
        # Consider more specific error handling based on scraper exceptions
        raise HTTPException(status_code=500, detail=f"An internal error occurred during scraping: {str(e)}")


# Basic root endpoint for health check or info
@app.get("/")
async def read_root():
    return {"message": "Google Maps Scraper API is running."}

# Example for running locally (uvicorn main_api:app --reload)
# if __name__ == "__main__":
#     import uvicorn
#     uvicorn.run(app, host="0.0.0.0", port=8001)
\ No newline at end of file
{
"nodes": [
{
"parameters": {
"url": "http://100.95.78.54:8001/scrape-get?query=hotels%20in%2098392&max_places=100&lang=en&headless=true",
"options": {}
},
"type": "n8n-nodes-base.httpRequest",
"typeVersion": 4.2,
"position": [
220,
0
],
"id": "9738622e-6a34-483f-87e4-7f0cda074bff",
"name": "HTTP Request"
}
],
"connections": {},
"pinData": {},
"meta": {
"instanceId": "bfc265a0402eb6543e6cbf43d37210f6fa8cb72736676656a159075d75879e79"
}
}
\ No newline at end of file
playwright
fastapi
uvicorn[standard]
\ No newline at end of file
from setuptools import setup, find_packages

setup(
    name="gmaps_scraper_server",
    version="0.1",
    packages=find_packages(),
    install_requires=[
        "playwright",
        "fastapi",
        "uvicorn[standard]"
    ],
)
\ No newline at end of file