Set up feeds, request, and filter them
This commit is contained in:
parent
657975324a
commit
70e78bcf42
@ -4,6 +4,8 @@ require_once __DIR__ . '/../vendor/autoload.php';
|
||||
use Doctrine\ORM\EntityManager;
|
||||
use Doctrine\ORM\Tools\Console\ConsoleRunner;
|
||||
use Doctrine\ORM\Tools\Console\EntityManagerProvider\SingleManagerProvider;
|
||||
use Lewisdale\App\Models\Repositories\FeedRepository;
|
||||
use Lewisdale\App\Tools\Console\TestFeed;
|
||||
|
||||
$dotenv = Dotenv\Dotenv::createImmutable([__DIR__, __DIR__ . "/.."]);
|
||||
$dotenv->load();
|
||||
@ -12,6 +14,11 @@ $dotenv->load();
|
||||
require_once __DIR__ . '/../src/dependencies.php';
|
||||
global $container;
|
||||
|
||||
$commands = [
|
||||
$container->get(TestFeed::class),
|
||||
];
|
||||
|
||||
ConsoleRunner::run(
|
||||
new SingleManagerProvider($container->get(EntityManager::class))
|
||||
new SingleManagerProvider($container->get(EntityManager::class)),
|
||||
$commands
|
||||
);
|
@ -20,7 +20,10 @@
|
||||
"symfony/dom-crawler": "^6.3",
|
||||
"symfony/css-selector": "^6.3",
|
||||
"symfony/http-client": "^6.3",
|
||||
"league/uri": "^6.8"
|
||||
"league/uri": "^6.8",
|
||||
"ext-simplexml": "*",
|
||||
"ext-curl": "*",
|
||||
"ext-dom": "*"
|
||||
},
|
||||
"require-dev": {
|
||||
"phpunit/phpunit": "^10.0"
|
||||
|
@ -26,7 +26,7 @@ class FeedController
|
||||
return $this->view->render($response, 'index.twig.html', ['feeds' => $feeds]);
|
||||
}
|
||||
|
||||
public function create(ServerRequestInterface $request, ResponseInterface $response)
|
||||
public function create(ServerRequestInterface $request, ResponseInterface $response): ResponseInterface
|
||||
{
|
||||
$this->logger->info("FeedController::create() called");
|
||||
$body = $request->getParsedBody();
|
||||
|
@ -4,9 +4,16 @@ declare(strict_types=1);
|
||||
|
||||
namespace Lewisdale\App\Models\Data;
|
||||
|
||||
use Doctrine\Common\Collections\Collection;
|
||||
use Doctrine\DBAL\Types\Types;
|
||||
use Doctrine\ORM\Mapping as ORM;
|
||||
use DOMXPath;
|
||||
use Lewisdale\App\Models\Types\XMLElement;
|
||||
use Lewisdale\App\Requests\Robots;
|
||||
use Ramsey\Uuid\Doctrine\UuidGenerator;
|
||||
use Ramsey\Uuid\UuidInterface;
|
||||
use SimpleXMLElement;
|
||||
use function Symfony\Component\String\s;
|
||||
|
||||
#[ORM\Entity]
|
||||
#[ORM\Table(name: 'feeds')]
|
||||
@ -24,4 +31,58 @@ class Feed
|
||||
|
||||
#[ORM\Column(type: 'string')]
|
||||
public string $title;
|
||||
|
||||
|
||||
/**
|
||||
* @var Collection<FeedFilter>
|
||||
*/
|
||||
#[ORM\OneToMany(mappedBy: 'feed', targetEntity: FeedFilter::class, cascade: ['persist', 'remove'])]
|
||||
public Collection $feedFilters;
|
||||
|
||||
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
|
||||
public SimpleXMLElement|null $remoteFeed;
|
||||
|
||||
#[ORM\Column(type: XMLElement::NAME, nullable: true)]
|
||||
public SimpleXMLElement|null $filteredFeed;
|
||||
|
||||
/**
 * Download the remote feed and store it in $remoteFeed.
 *
 * The URL is only requested when Robots::allowed() permits it.
 *
 * @throws \Exception when robots.txt disallows the URL, or when the feed
 *                    cannot be loaded or parsed as XML
 */
public function fetch(): void
{
    if (!Robots::allowed($this->url)) {
        throw new \Exception("Robots.txt disallows fetching this feed");
    }

    // simplexml_load_file() returns false on failure. $remoteFeed only accepts
    // SimpleXMLElement|null, so surface a clear error instead of a TypeError.
    $feed = simplexml_load_file($this->url);
    if ($feed === false) {
        throw new \RuntimeException("Unable to load or parse the feed at {$this->url}");
    }

    $this->remoteFeed = $feed;
}
|
||||
|
||||
/**
 * Check a single feed item against every filter attached to this feed.
 *
 * @param \DOMNode $item the item node to test
 *
 * @return bool false as soon as one filter's execute() rejects the item,
 *              true when every filter accepts it (or there are no filters)
 */
private function test_item(\DOMNode $item): bool
{
    foreach ($this->feedFilters as $feedFilter) {
        $element = simplexml_import_dom($item);
        $accepted = $feedFilter->execute($element);

        if ($accepted) {
            continue;
        }

        return false;
    }

    return true;
}
|
||||
|
||||
/**
 * Build $filteredFeed from $remoteFeed.
 *
 * Every <item> matched by at least one filter's to_xpath() expression is
 * removed from a copy of the remote feed; the result is stored in
 * $filteredFeed. The remote feed is fetched first when it is not loaded yet.
 *
 * @throws \Exception         propagated from fetch()
 * @throws \RuntimeException  when the combined XPath expression is rejected
 */
public function filter(): void
{
    if ($this->remoteFeed === null) {
        $this->fetch();
    }

    // Work on a deep copy: dom_import_simplexml() shares its nodes with
    // $remoteFeed, so removing items in place would also strip them from the
    // unfiltered feed.
    $doc = dom_import_simplexml($this->remoteFeed)->ownerDocument->cloneNode(true);

    $predicates = array_map(
        static fn ($filter) => '(' . $filter->to_xpath() . ')',
        $this->feedFilters->toArray(),
    );

    // With no filters there is nothing to remove ("//item[]" is not valid XPath).
    if ($predicates !== []) {
        $xpath = new DOMXPath($doc);

        // "or" accepts both boolean and node-set operands; the union operator
        // "|" is only defined for node-sets.
        $to_remove = $xpath->query('//item[' . implode(' or ', $predicates) . ']');
        if ($to_remove === false) {
            throw new \RuntimeException('Feed filters produced an invalid XPath expression');
        }

        foreach ($to_remove as $item) {
            $item->parentNode->removeChild($item);
        }
    }

    $this->filteredFeed = simplexml_import_dom($doc->documentElement);
}
|
||||
}
|
@ -13,6 +13,24 @@ use Ramsey\Uuid\UuidInterface;
|
||||
#[ORM\HasLifecycleCallbacks]
|
||||
class FeedFilter
|
||||
{
|
||||
/**
 * @param FilterTarget       $target which part of a feed item the filter inspects
 * @param FilterType         $filter how $value is compared against the target
 * @param string             $value  the text (or pattern) to compare with
 * @param Feed               $feed   the feed this filter belongs to
 * @param UuidInterface|null $id     explicit identifier; when null the id
 *                                   property is left unassigned here
 */
public function __construct(
    FilterTarget $target,
    FilterType $filter,
    string $value,
    Feed $feed,
    ?UuidInterface $id = null,
) {
    // Only assign an id that was actually supplied.
    if ($id !== null) {
        $this->id = $id;
    }

    $this->target = $target;
    $this->filter = $filter;
    $this->value = $value;
    $this->feed = $feed;
}
|
||||
|
||||
#[ORM\Id]
|
||||
#[ORM\Column(type: "uuid", unique: true)]
|
||||
#[ORM\GeneratedValue(strategy: "CUSTOM")]
|
||||
@ -33,7 +51,30 @@ class FeedFilter
|
||||
#[ORM\Column(type: 'string')]
|
||||
public string $value;
|
||||
|
||||
#[ORM\ManyToOne(targetEntity: Feed::class)]
|
||||
#[ORM\ManyToOne(targetEntity: Feed::class, cascade: ['persist', 'remove'], inversedBy: 'feedFilters')]
|
||||
#[ORM\JoinColumn(name: 'feed_id', referencedColumnName: 'id')]
|
||||
public Feed $feed;
|
||||
|
||||
/**
 * Build the XPath predicate that selects the items this filter REJECTS.
 *
 * The expression is evaluated relative to an item element, with the target's
 * enum value used as the child element name. It mirrors execute(): an item is
 * selected here exactly when execute() would return false for it (apart from
 * INCLUDE, which execute() compares case-insensitively).
 *
 * @throws \LogicException for REGEX filters, which XPath 1.0 cannot express
 */
public function to_xpath(): string
{
    $target = $this->target->value;
    $literal = self::xpath_literal($this->value);

    return match ($this->filter) {
        // Keep only exact matches, so reject everything that is not equal.
        FilterType::EXACT => "not({$target}={$literal})",
        FilterType::INCLUDE => "{$target}[not(contains(text(), {$literal}))]",
        FilterType::EXCLUDE => "{$target}[contains(text(), {$literal})]",
        FilterType::REGEX => throw new \LogicException('REGEX filters cannot be expressed as XPath'),
    };
}

/**
 * Quote an arbitrary string as an XPath 1.0 string literal.
 *
 * XPath 1.0 has no escape sequences, so a value containing both quote
 * characters is assembled with concat().
 */
private static function xpath_literal(string $value): string
{
    if (!str_contains($value, "'")) {
        return "'{$value}'";
    }

    if (!str_contains($value, '"')) {
        return "\"{$value}\"";
    }

    return "concat('" . implode("', \"'\", '", explode("'", $value)) . "')";
}
|
||||
|
||||
/**
 * Decide whether a feed item passes this filter.
 *
 * @param \SimpleXMLElement $item the item whose title, description or link
 *                                (depending on the target) is inspected
 *
 * @return bool true when the item passes, false when it is rejected
 */
public function execute(\SimpleXMLElement $item): bool
{
    $value = (string) match ($this->target) {
        FilterTarget::TITLE => $item->title,
        FilterTarget::DESCRIPTION => $item->description,
        FilterTarget::LINK => $item->link,
    };

    // NOTE(review): INCLUDE compares case-insensitively while EXCLUDE is
    // case-sensitive. Confirm whether that asymmetry is intended.
    return match ($this->filter) {
        FilterType::INCLUDE => str_contains(strtolower($value), strtolower($this->value)),
        FilterType::EXACT => $value === $this->value,
        // preg_match() yields 1, 0 or false; compare explicitly so the bool
        // return type holds under strict typing and a failed pattern rejects.
        FilterType::REGEX => preg_match($this->value, $value) === 1,
        FilterType::EXCLUDE => !str_contains($value, $this->value),
    };
}
|
||||
}
|
40
src/Models/Types/XMLElement.php
Normal file
40
src/Models/Types/XMLElement.php
Normal file
@ -0,0 +1,40 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
|
||||
namespace Lewisdale\App\Models\Types;
|
||||
|
||||
use Doctrine\DBAL\Platforms\AbstractPlatform;
|
||||
use Doctrine\DBAL\Types\Type;
|
||||
|
||||
/**
 * Doctrine DBAL column type that stores a SimpleXMLElement as its XML text.
 */
class XMLElement extends Type
{
    public const NAME = 'xml_element';

    /**
     * The XML is stored in a TEXT column.
     */
    public function getSQLDeclaration(array $column, AbstractPlatform $platform): string
    {
        return 'TEXT';
    }

    /**
     * Parse the stored XML text back into a SimpleXMLElement.
     *
     * @param mixed $value the database value; null and SimpleXMLElement
     *                     instances are returned unchanged
     *
     * @throws \UnexpectedValueException when the stored text is not valid XML
     */
    public function convertToPHPValue($value, AbstractPlatform $platform): ?\SimpleXMLElement
    {
        if ($value === null || $value instanceof \SimpleXMLElement) {
            return $value;
        }

        // simplexml_load_string() returns false on malformed input; report it
        // here rather than letting false reach a typed entity property.
        $element = simplexml_load_string($value);
        if ($element === false) {
            throw new \UnexpectedValueException(
                'Could not parse the stored value as XML for type ' . self::NAME
            );
        }

        return $element;
    }

    /**
     * Serialise a SimpleXMLElement to XML text; any other value is passed
     * through unchanged.
     *
     * @throws \UnexpectedValueException when the element cannot be serialised
     */
    public function convertToDatabaseValue($value, AbstractPlatform $platform): mixed
    {
        if (!$value instanceof \SimpleXMLElement) {
            return $value;
        }

        $xml = $value->asXML();
        if ($xml === false) {
            throw new \UnexpectedValueException(
                'Could not serialise the XML element for type ' . self::NAME
            );
        }

        return $xml;
    }

    public function getName(): string
    {
        return self::NAME;
    }
}
|
89
src/Requests/Robots.php
Normal file
89
src/Requests/Robots.php
Normal file
@ -0,0 +1,89 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
|
||||
namespace Lewisdale\App\Requests;
|
||||
|
||||
class Robots
{
    // Original PHP code by Chirp Internet: www.chirpinternet.eu
    // Adapted to include 404 and Allow directive checking by Eric at LinkUp.com
    // Please acknowledge use of this code by including this header.

    /**
     * Check whether the host's robots.txt permits fetching $url.
     *
     * Rules are collected from the groups whose User-agent matches "*" or
     * $useragent. Among the rules that prefix-match the URL path, the longest
     * one wins, and Allow wins over Disallow when both have the same length.
     * Wildcards ("*", "$") inside rule paths are compared literally.
     *
     * @param string      $url       the URL about to be requested
     * @param string|null $useragent user-agent token to match in addition to "*"
     *
     * @return bool true when fetching is permitted, or when no robots.txt
     *              could be retrieved (including URLs without a host)
     */
    public static function allowed(string $url, ?string $useragent = "Baleen"): bool
    {
        // parse url to retrieve host and path
        $parsed = parse_url($url);
        if ($parsed === false || empty($parsed['host'])) {
            // no host means there is no robots.txt to consult
            return true;
        }

        // a URL such as "https://example.com" has no path component
        $path = $parsed['path'] ?? '/';

        $agents = [preg_quote('*', '/')];
        if ($useragent !== null && $useragent !== '') {
            $agents[] = preg_quote($useragent, '/');
        }
        $agents = implode('|', $agents);

        // robots.txt is fetched from the same scheme and authority as the URL
        $scheme = $parsed['scheme'] ?? 'http';
        $port = isset($parsed['port']) ? ":{$parsed['port']}" : '';
        $robots_txt = self::fetchLines("{$scheme}://{$parsed['host']}{$port}/robots.txt");

        // if there isn't a robots, then we're allowed in
        if ($robots_txt === []) {
            return true;
        }

        $rules = [];
        $rule_applies = false;
        $previous_was_agent = false;

        foreach ($robots_txt as $line) {
            // drop comments, then skip blank lines and lines that are not "field: value"
            $line = trim(explode('#', $line, 2)[0]);
            if ($line === '' || !str_contains($line, ':')) {
                continue;
            }

            [$type, $rule] = explode(':', $line, 2);
            $type = strtolower(trim($type));
            $rule = trim($rule);

            // following rules only apply if User-agent matches $useragent or '*'
            if ($type === 'user-agent') {
                $matches = preg_match("/($agents)/i", $rule) === 1;
                // consecutive User-agent lines form one group: any match applies
                $rule_applies = $previous_was_agent ? ($rule_applies || $matches) : $matches;
                $previous_was_agent = true;
                continue;
            }
            $previous_was_agent = false;

            // only Allow/Disallow carry path rules; an empty path matches nothing useful
            if ($rule_applies && ($type === 'allow' || $type === 'disallow') && $rule !== '') {
                $rules[] = ['type' => $type, 'match' => $rule];
            }
        }

        $is_allowed = true;
        $current_strength = 0;

        foreach ($rules as $rule) {
            // check if page hits on a rule
            if (!str_starts_with($path, $rule['match'])) {
                continue;
            }

            // prefer longer (more specific) rules and Allow trumps Disallow if rules same length
            $strength = strlen($rule['match']);
            $is_allow = $rule['type'] === 'allow';

            if ($strength > $current_strength || ($strength === $current_strength && $is_allow)) {
                $current_strength = $strength;
                $is_allowed = $is_allow;
            }
        }

        return $is_allowed;
    }

    /**
     * Fetch robots.txt and return its lines; an empty list when it cannot be
     * retrieved or the server does not answer with HTTP 200.
     *
     * @return list<string>
     */
    private static function fetchLines(string $robots_url): array
    {
        if (function_exists('curl_init')) {
            $handle = curl_init($robots_url);
            if ($handle === false) {
                return [];
            }

            curl_setopt($handle, CURLOPT_RETURNTRANSFER, true);
            $response = curl_exec($handle);
            $http_code = curl_getinfo($handle, CURLINFO_HTTP_CODE);

            // only pay attention to the file if the server says it exists
            return ($http_code === 200 && is_string($response)) ? explode("\n", $response) : [];
        }

        // Best-effort fallback without curl: a failed request simply means
        // "no robots.txt", so the warning from file() is deliberately muted.
        $lines = @file($robots_url);

        return $lines === false ? [] : $lines;
    }
}
|
79
src/Tools/Console/TestFeed.php
Normal file
79
src/Tools/Console/TestFeed.php
Normal file
@ -0,0 +1,79 @@
|
||||
<?php
|
||||
declare(strict_types=1);
|
||||
|
||||
|
||||
namespace Lewisdale\App\Tools\Console;
|
||||
|
||||
use Symfony\Component\Console\{Attribute\AsCommand,
|
||||
Command\Command,
|
||||
Input\InputArgument,
|
||||
Input\InputInterface,
|
||||
Output\OutputInterface};
|
||||
use Doctrine\Common\Collections\ArrayCollection;
|
||||
use Doctrine\ORM\EntityManager;
|
||||
use Lewisdale\App\Models\Data\Feed;
|
||||
use Lewisdale\App\Models\Data\FeedFilter;
|
||||
use Lewisdale\App\Models\Data\FilterTarget;
|
||||
use Lewisdale\App\Models\Data\FilterType;
|
||||
use Lewisdale\App\Models\Repositories\FeedRepository;
|
||||
use function DI\add;
|
||||
|
||||
/**
 * Console command "test:feed".
 *
 * Looks up an already saved feed by URL and title, attaches an EXCLUDE filter
 * for titles containing "S2", runs the feed's filter() and writes the filtered
 * XML to feed_filtered.xml in the current working directory.
 */
#[AsCommand(name: "test:feed")]
class TestFeed extends Command
{
    public function __construct(
        private readonly EntityManager $em,
        private readonly FeedRepository $feedRepository,
    ) {
        parent::__construct();
    }

    protected function configure(): void
    {
        $this
            ->addArgument("url", InputArgument::REQUIRED, "The URL of the feed to test")
            ->addArgument("title", InputArgument::REQUIRED, "The title of the feed to test");
    }

    /**
     * @return int Command::SUCCESS when the filtered feed was written,
     *             Command::FAILURE when the feed is unknown or cannot be written
     */
    protected function execute(InputInterface $input, OutputInterface $output): int
    {
        $output->writeln([
            "Testing Feed",
            "============",
        ]);

        $url = (string) $input->getArgument("url");
        $title = (string) $input->getArgument("title");
        $criteria = ['url' => $url, 'title' => $title];

        // findOneBy() returns null when nothing matches; the feed must exist already.
        $saved_feed = $this->feedRepository->findOneBy($criteria);
        if ($saved_feed === null) {
            $output->writeln("No saved feed found for URL '{$url}' with title '{$title}'");

            return Command::FAILURE;
        }
        $output->writeln("Feed title: " . $saved_feed->title);

        $s2_filter = new FeedFilter(FilterTarget::TITLE, FilterType::EXCLUDE, "S2", $saved_feed);
        $this->em->persist($s2_filter);
        $this->em->flush();

        // Look the feed up again after the flush, as before, and filter that result.
        $saved_feed = $this->feedRepository->findOneBy($criteria) ?? $saved_feed;
        $saved_feed->filter();

        $this->em->persist($saved_feed);
        $this->em->flush();

        // asXML($file) returns a bool, not the XML, so report the outcome
        // instead of concatenating it into the message.
        $filtered = $saved_feed->filteredFeed;
        if ($filtered === null || $filtered->asXML('feed_filtered.xml') === false) {
            $output->writeln("Unable to write the filtered feed to feed_filtered.xml");

            return Command::FAILURE;
        }

        $output->writeln("Filtered feed written to feed_filtered.xml");

        return Command::SUCCESS;
    }
}
|
@ -4,6 +4,8 @@ use Lewisdale\App\Controllers\FeedController;
|
||||
use Lewisdale\App\Controllers\SampleController;
|
||||
use Slim\Views\TwigMiddleware;
|
||||
|
||||
ini_set('user_agent', 'Baleen/1.0 (https://lewisdale.dev)');
|
||||
|
||||
require_once __DIR__ . "/dependencies.php";
|
||||
|
||||
global $container;
|
||||
|
@ -6,6 +6,7 @@ use Doctrine\DBAL\Types\Type;
|
||||
use Doctrine\ORM\EntityManager;
|
||||
use Doctrine\ORM\ORMSetup;
|
||||
use Lewisdale\App\Logging\FileLogger;
|
||||
use Lewisdale\App\Models\Types\XMLElement;
|
||||
use Lewisdale\App\TwigExtensions\CsrfExtension;
|
||||
use Psr\Log\LoggerInterface;
|
||||
use Ramsey\Uuid\Doctrine\UuidType;
|
||||
@ -40,6 +41,7 @@ $container->set(LoggerInterface::class, $container->get(FileLogger::class));
|
||||
|
||||
$container->set(EntityManager::class, static function() {
|
||||
Type::addType('uuid', UuidType::class);
|
||||
Type::addType(XMLElement::NAME, XMLElement::class);
|
||||
|
||||
$config = ORMSetup::createAttributeMetadataConfiguration(
|
||||
paths: array(__DIR__."/Models/Data"),
|
||||
@ -51,6 +53,8 @@ $container->set(EntityManager::class, static function() {
|
||||
'path' => __DIR__ . '/../' . getenv("SQLITE_DB_NAME"),
|
||||
], $config);
|
||||
|
||||
$connection->executeQuery("PRAGMA foreign_keys = ON");
|
||||
|
||||
return new EntityManager($connection, $config);
|
||||
});
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user