Automatically detect character encoding of CSV files when processing them

to handle non-UTF-8 file types. Added a new test case and enhanced the test
rigs to be able to write non-UTF-8 files.

Final cleanup
This commit is contained in:
Brady Wetherington 2024-11-06 14:30:46 +00:00
parent 9830959f11
commit a50c8c6269
5 changed files with 120 additions and 3 deletions

View file

@ -13,6 +13,7 @@ use Illuminate\Support\Facades\Auth;
use Illuminate\Support\Facades\DB;
use League\Csv\Reader;
use Illuminate\Support\Facades\Log;
use Onnov\DetectEncoding\EncodingDetector;
abstract class Importer
{
@ -124,11 +125,28 @@ abstract class Importer
if (! ini_get('auto_detect_line_endings')) {
ini_set('auto_detect_line_endings', '1');
}
$detector = new EncodingDetector();
// By default the importer passes a url to the file.
// However, for testing we also support passing a string directly
if (is_file($file)) {
$file_contents = file_get_contents($file); // TODO - this loads up the file in memory! Which could be 'big' and thus, this could be 'bad'
} else {
$file_contents = $file;
}
$encoding = $detector->getEncoding($file_contents);
\Log::debug("DETECTED ENCODING IS: $encoding");
$file_contents = null; //try to save some memory?
if (is_file($file)) {
if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) {
$file = "php://filter/convert.iconv.$encoding.utf-8/resource=".$file;
}
$this->csv = Reader::createFromPath($file);
} else {
//we already have the string, so do the conversion directly here?
if ($encoding && strcasecmp($encoding, 'UTF-8') != 0) {
$file = iconv($encoding, 'UTF-8', $file);
}
$this->csv = Reader::createFromString($file);
}
$this->tempPassword = substr(str_shuffle('0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'), 0, 40);

View file

@ -20,6 +20,7 @@
"php": "^8.1",
"ext-curl": "*",
"ext-fileinfo": "*",
"ext-iconv": "*",
"ext-json": "*",
"ext-mbstring": "*",
"ext-pdo": "*",
@ -55,6 +56,7 @@
"nunomaduro/collision": "^7.0",
"okvpn/clock-lts": "^1.0",
"onelogin/php-saml": "^3.4",
"onnov/detect-encoding": "^2.0",
"osa-eg/laravel-teams-notification": "^2.1",
"paragonie/constant_time_encoding": "^2.3",
"paragonie/sodium_compat": "^1.19",

67
composer.lock generated
View file

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "0750e3a427347b2a56a05a8b9b533d48",
"content-hash": "2a6e7f5e039ee2f40605aefc5c5baf08",
"packages": [
{
"name": "alek13/slack",
@ -5574,6 +5574,70 @@
],
"time": "2024-05-30T15:14:26+00:00"
},
{
"name": "onnov/detect-encoding",
"version": "v2.0.0",
"source": {
"type": "git",
"url": "https://github.com/onnov/detect-encoding.git",
"reference": "6a8159ac3e6178ae043244b9d66a9b2701121e07"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/onnov/detect-encoding/zipball/6a8159ac3e6178ae043244b9d66a9b2701121e07",
"reference": "6a8159ac3e6178ae043244b9d66a9b2701121e07",
"shasum": ""
},
"require": {
"ext-iconv": "*",
"php": ">=7.3"
},
"require-dev": {
"infection/infection": "*",
"phpbench/phpbench": "*",
"phpcompatibility/php-compatibility": "*",
"phpmd/phpmd": "*",
"phpstan/phpstan": "*",
"phpstan/phpstan-strict-rules": "*",
"phpunit/phpunit": "*",
"roave/backward-compatibility-check": "*",
"squizlabs/php_codesniffer": "*"
},
"type": "library",
"autoload": {
"psr-4": {
"Onnov\\DetectEncoding\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "onnov",
"email": "oblnn@yandex.ru"
}
],
"description": "Text encoding definition class instead of mb_detect_encoding. Defines: utf-8, windows-1251, koi8-r, iso-8859-5, ibm866, .....",
"homepage": "https://github.com/onnov/detect-encoding",
"keywords": [
"cyrillic",
"encoding",
"ibm866",
"iconv",
"iso-8859-5",
"koi8-r",
"mb_detect_encoding",
"utf-8",
"windows-1251"
],
"support": {
"issues": "https://github.com/onnov/detect-encoding/issues",
"source": "https://github.com/onnov/detect-encoding/tree/v2.0.0"
},
"time": "2021-01-04T14:29:34+00:00"
},
{
"name": "osa-eg/laravel-teams-notification",
"version": "v2.1.2",
@ -16570,6 +16634,7 @@
"php": "^8.1",
"ext-curl": "*",
"ext-fileinfo": "*",
"ext-iconv": "*",
"ext-json": "*",
"ext-mbstring": "*",
"ext-pdo": "*"

View file

@ -141,6 +141,32 @@ class ImportAssetsTest extends ImportDataTestCase implements TestsPermissionsReq
}
#[Test]
/**
 * Imports an asset from a CSV file that is NOT UTF-8 encoded and verifies
 * that the asset name is stored as the original UTF-8 string.
 *
 * The import file is written to disk encoded as WINDOWS-1251, so the
 * importer has to detect the encoding and convert it for the name to
 * round-trip unchanged.
 */
public function importInternationalAsset(): void
{
    // Cyrillic sample text; both characters are representable in WINDOWS-1251.
    $internationalName = 'це';

    // The file builder keys the asset name column as 'itemName' (not 'name').
    $importFileBuilder = ImportFileBuilder::new(['itemName' => $internationalName]);
    $row = $importFileBuilder->firstRow();

    // Save the CSV to the imports directory, transcoded to WINDOWS-1251.
    $import = Import::factory()->asset()->create([
        'file_path' => $importFileBuilder->saveToImportsDirectory(null, 'WINDOWS-1251'),
    ]);

    $this->actingAsForApi(User::factory()->superuser()->create());

    $this->importFileResponse(['import' => $import->id])
        ->assertOk()
        ->assertExactJson([
            'payload' => null,
            'status' => 'success',
            'messages' => ['redirect_url' => route('hardware.index')]
        ]);

    // Look the asset up by serial number; sole() fails unless exactly one row matches.
    $newAsset = Asset::query()
        ->with(['location', 'supplier', 'company', 'assignedAssets', 'defaultLoc', 'assetStatus', 'model.category', 'model.manufacturer'])
        ->where('serial', $row['serialNumber'])
        ->sole();

    // The stored name must equal the original UTF-8 string, not the WINDOWS-1251 bytes.
    $this->assertEquals($internationalName, $newAsset->name);
}
#[Test]
public function willIgnoreUnknownColumnsWhenFileContainsUnknownColumns(): void
{

View file

@ -206,7 +206,7 @@ abstract class FileBuilder
*
* @return string The filename.
*/
public function saveToImportsDirectory(?string $filename = null): string
public function saveToImportsDirectory(?string $filename = null, ?string $locale = null): string
{
$filename ??= Str::random(40) . '.csv';
@ -214,9 +214,15 @@ abstract class FileBuilder
$stream = fopen(config('app.private_uploads') . "/imports/{$filename}", 'w');
foreach ($this->toCsv() as $row) {
if ($locale) {
$newrow = [];
foreach ($row as $index => $cell) {
$newrow[$index] = iconv('utf-8', $locale, (string) $cell);
}
$row = $newrow;
}
fputcsv($stream, $row);
}
return $filename;
} finally {
if (is_resource($stream)) {