| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
- (in-package :cl-user)
- (defpackage chatikbot.scraping
- (:use :cl :chatikbot.common)
- (:export :*cookie-jar*
- :*current-dom*
- :*current-status*
- :*current-headers*
- :*current-uri*
- :do-request
- :do-form
- :have-cookies
- :with-request
- :with-chat-cookies
- :scrape-list))
- (in-package :chatikbot.scraping)
- (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
- (defvar *current-dom* nil "Currently active request dom")
- (defvar *current-status* nil "Currently active request status")
- (defvar *current-headers* nil "Currently active request headers")
- (defvar *current-uri* nil "Currently active request uri")
- (defun apply-context (template context)
- (sublis context template))
- (defun do-request (template &optional context)
- (let ((args (apply-context template context)))
- (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
- (defmacro with-request ((template &optional context) &body body)
- `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
- (do-request ,template ,context)
- (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
- ,@body))
- (defmacro with-chat-cookies ((chat-id chat-cookies) &body body)
- (alexandria:with-gensyms (g-chat g-cookies)
- `(let* ((,g-chat ,chat-id)
- (,g-cookies ,chat-cookies)
- (*cookie-jar* (or (gethash ,g-chat ,g-cookies)
- (setf (gethash ,g-chat ,g-cookies)
- (cl-cookie:make-cookie-jar)))))
- ,@body)))
- (defun node-full-url (node attr &optional (uri *current-uri*))
- (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
- (defun do-validate (info &optional (dom *current-dom*))
- (cond
- ((null info) t)
- (t (not (zerop (length (clss:select info dom)))))))
- (defun do-form (form-info &optional context validate)
- (when form-info
- (destructuring-bind (request selector template) form-info
- (with-request (request context)
- (let ((form (clss:select selector *current-dom*)))
- (unless (zerop (length form))
- (let* ((form (elt form 0))
- (action (node-full-url form "action"))
- (method (or (keyify (plump:attribute form "method")) :get))
- (content (append (apply-context template context)
- (loop for input across (clss:select "input,select" form)
- for name = (plump:attribute input "name")
- for value = (plump:attribute input "value")
- when name
- unless (agets template name)
- collect (cons name value))))
- (action-request `(,action :method ,method :content ,content)))
- (with-request (action-request)
- (unless (do-validate validate)
- (error "validate error"))
- (values *current-dom* *current-status* *current-headers* *current-uri*)))))))))
- (defun have-cookies (request)
- (let ((uri (quri:uri (car request))))
- (cl-cookie:cookie-jar-host-cookies
- *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
- :securep (string= (quri:uri-scheme uri) "https"))))
- (defun extract-from-node (node template &optional (uri *current-uri*))
- (destructuring-bind (selector &optional post) template
- (let* ((nodes (clss:select selector node))
- (node (and (not (zerop (length nodes)))
- (elt nodes 0))))
- (when node
- (cond
- ((eq :href post) (node-full-url node "href" uri))
- ((stringp post) (plump:attribute node post))
- (t (unspacify (text-with-cdata node))))))))
- (defun extract-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
- (let ((nodes (clss:select items-selector dom)))
- (when limit
- (setf nodes (subseq nodes 0 (min limit (length nodes)))))
- (loop for node across nodes
- collect (loop for (key . template) in info-templates
- collect (cons key (extract-from-node node template uri))))))
- (defun scrape-list (info &optional context)
- (let* ((scrape (agets info :scrape))
- (request (agets scrape :request))
- (init (agets info :init))
- (validate (agets info :validate)))
- (labels ((return-info ()
- (extract-info (agets scrape :items)
- (agets scrape :info))))
- (unless (have-cookies request)
- (do-form init context validate))
- (with-request (request context)
- (if (do-validate validate)
- (return-info)
- (progn
- (do-form init context validate)
- (with-request (request context)
- (return-info))))))))
|