(in-package :cl-user) (defpackage chatikbot.scraping (:use :cl :chatikbot.common) (:export :*cookie-jar* :*current-dom* :*current-status* :*current-headers* :*current-uri* :do-request :do-form :have-cookies :with-request :with-chat-cookies :scrape-list)) (in-package :chatikbot.scraping) (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies") (defvar *current-dom* nil "Currently active request dom") (defvar *current-status* nil "Currently active request status") (defvar *current-headers* nil "Currently active request headers") (defvar *current-uri* nil "Currently active request uri") (defun apply-context (template context) (sublis context template)) (defun do-request (template &optional context processor) (let ((args (apply-context template context))) (apply (or processor 'xml-request) `(,@args :cookie-jar ,*cookie-jar*)))) (defmacro with-request ((template &optional context processor) &Body body) `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*) (do-request ,template ,context ,processor) (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*)) ,@body)) (defmacro with-chat-cookies ((chat-id chat-cookies) &body body) (alexandria:with-gensyms (g-chat g-cookies) `(let* ((,g-chat ,chat-id) (,g-cookies ,chat-cookies) (*cookie-jar* (or (gethash ,g-chat ,g-cookies) (setf (gethash ,g-chat ,g-cookies) (cl-cookie:make-cookie-jar))))) ,@body))) (defun node-full-url (node attr &optional (uri *current-uri*)) (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri))) (defun do-validate (info &optional (dom *current-dom*)) (cond ((null info) t) (t (not (zerop (length (clss:select info dom))))))) (defun do-form (form-info &optional context validate) (when form-info (destructuring-bind (request selector template) form-info (with-request (request context) (let ((form (clss:select selector *current-dom*))) (unless (zerop (length form)) (let* ((form (elt form 0)) (action (node-full-url form "action")) (method (or (keyify (plump:attribute form "method")) :get)) (content (append (apply-context template context) (loop for input across (clss:select "input,select" form) for name = (plump:attribute input "name") for value = (plump:attribute input "value") when name unless (agets template name) collect (cons name value)))) (action-request `(,action :method ,method :content ,content))) (with-request (action-request) (unless (do-validate validate) (error "validate error")) (values *current-dom* *current-status* *current-headers* *current-uri*))))))))) (defun have-cookies (request) (let ((uri (quri:uri (car request)))) (cl-cookie:cookie-jar-host-cookies *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/") :securep (string= (quri:uri-scheme uri) "https")))) (defun extract-data (node &key attr (uri *current-uri*)) (when node (cond ((eq :href attr) (node-full-url node "href" uri)) ((stringp attr) (plump:attribute node attr)) (t (unspacify (text-with-cdata node)))))) (defun extract-info (parent template &optional (uri *current-uri*)) (destructuring-bind (selector &key child attr multiple) template (labels ((extract (n) (when child (setf n (elt (plump:children n) child))) (extract-data n :attr attr :uri uri))) (let* ((nodes (clss:select selector parent)) (data (map 'list #'extract nodes))) (if multiple data (car data)))))) (defun extract-items-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*)) (let ((nodes (clss:select items-selector dom))) (when limit (setf nodes (subseq nodes 0 (min limit (length nodes))))) (loop for node across nodes collect (loop for (key . template) in info-templates collect (cons key (extract-info node template uri)))))) (defun scrape-list (info &optional context) (let* ((scrape (agets info :scrape)) (request (agets scrape :request)) (processor (or (agets scrape :processor) #'xml-request)) (init (agets info :init)) (validate (agets info :validate))) (labels ((return-info () (extract-items-info (agets scrape :items) (agets scrape :info)))) (unless (have-cookies request) (do-form init context validate)) (with-request (request context processor) (if (do-validate validate) (return-info) (progn (do-form init context validate) (with-request (request context) (return-info))))))))