1
0

scraping.lisp 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120
  1. (in-package :cl-user)
  2. (defpackage chatikbot.scraping
  3. (:use :cl :chatikbot.common)
  4. (:export :*cookie-jar*
  5. :*current-dom*
  6. :*current-status*
  7. :*current-headers*
  8. :*current-uri*
  9. :do-request
  10. :do-form
  11. :have-cookies
  12. :with-request
  13. :with-chat-cookies
  14. :scrape-list))
  15. (in-package :chatikbot.scraping)
  16. (defvar *cookie-jar* (cl-cookie:make-cookie-jar) "Currently active cookies")
  17. (defvar *current-dom* nil "Currently active request dom")
  18. (defvar *current-status* nil "Currently active request status")
  19. (defvar *current-headers* nil "Currently active request headers")
  20. (defvar *current-uri* nil "Currently active request uri")
  21. (defun apply-context (template context)
  22. (sublis context template))
  23. (defun do-request (template &optional context)
  24. (let ((args (apply-context template context)))
  25. (apply 'xml-request `(,@args :cookie-jar ,*cookie-jar*))))
  26. (defmacro with-request ((template &optional context) &body body)
  27. `(multiple-value-bind (*current-dom* *current-status* *current-headers* *current-uri*)
  28. (do-request ,template ,context)
  29. (declare (ignorable *current-dom* *current-status* *current-headers* *current-uri*))
  30. ,@body))
  31. (defmacro with-chat-cookies ((chat-id chat-cookies) &body body)
  32. (alexandria:with-gensyms (g-chat g-cookies)
  33. `(let* ((,g-chat ,chat-id)
  34. (,g-cookies ,chat-cookies)
  35. (*cookie-jar* (or (gethash ,g-chat ,g-cookies)
  36. (setf (gethash ,g-chat ,g-cookies)
  37. (cl-cookie:make-cookie-jar)))))
  38. ,@body)))
  39. (defun node-full-url (node attr &optional (uri *current-uri*))
  40. (quri:render-uri (quri:merge-uris (quri:uri (plump:attribute node attr)) uri)))
  41. (defun do-validate (info &optional (dom *current-dom*))
  42. (cond
  43. ((null info) t)
  44. (t (not (zerop (length (clss:select info dom)))))))
  45. (defun do-form (form-info &optional context validate)
  46. (when form-info
  47. (destructuring-bind (request selector template) form-info
  48. (with-request (request context)
  49. (let ((form (clss:select selector *current-dom*)))
  50. (unless (zerop (length form))
  51. (let* ((form (elt form 0))
  52. (action (node-full-url form "action"))
  53. (method (or (keyify (plump:attribute form "method")) :get))
  54. (content (append (apply-context template context)
  55. (loop for input across (clss:select "input,select" form)
  56. for name = (plump:attribute input "name")
  57. for value = (plump:attribute input "value")
  58. when name
  59. unless (agets template name)
  60. collect (cons name value))))
  61. (action-request `(,action :method ,method :content ,content)))
  62. (with-request (action-request)
  63. (unless (do-validate validate)
  64. (error "validate error"))
  65. (values *current-dom* *current-status* *current-headers* *current-uri*)))))))))
  66. (defun have-cookies (request)
  67. (let ((uri (quri:uri (car request))))
  68. (cl-cookie:cookie-jar-host-cookies
  69. *cookie-jar* (quri:uri-host uri) (or (quri:uri-path uri) "/")
  70. :securep (string= (quri:uri-scheme uri) "https"))))
  71. (defun extract-data (node &key attr (uri *current-uri*))
  72. (when node
  73. (cond
  74. ((eq :href attr) (node-full-url node "href" uri))
  75. ((stringp attr) (plump:attribute node attr))
  76. (t (unspacify (text-with-cdata node))))))
  77. (defun extract-info (parent template &optional (uri *current-uri*))
  78. (destructuring-bind (selector &key attr multiple) template
  79. (labels ((extract (n)
  80. (extract-data n :attr attr :uri uri)))
  81. (let* ((nodes (clss:select selector parent))
  82. (data (map 'list #'extract nodes)))
  83. (if multiple data (car data))))))
  84. (defun extract-items-info (items-selector info-templates &key limit (dom *current-dom*) (uri *current-uri*))
  85. (let ((nodes (clss:select items-selector dom)))
  86. (when limit
  87. (setf nodes (subseq nodes 0 (min limit (length nodes)))))
  88. (loop for node across nodes
  89. collect (loop for (key . template) in info-templates
  90. collect (cons key (extract-info node template uri))))))
  91. (defun scrape-list (info &optional context)
  92. (let* ((scrape (agets info :scrape))
  93. (request (agets scrape :request))
  94. (init (agets info :init))
  95. (validate (agets info :validate)))
  96. (labels ((return-info ()
  97. (extract-items-info (agets scrape :items)
  98. (agets scrape :info))))
  99. (unless (have-cookies request)
  100. (do-form init context validate))
  101. (with-request (request context)
  102. (if (do-validate validate)
  103. (return-info)
  104. (progn
  105. (do-form init context validate)
  106. (with-request (request context)
  107. (return-info))))))))